upgini 1.2.14a1__py3-none-any.whl → 1.2.14a3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +5 -3
- upgini/resource_bundle/strings.properties +2 -1
- upgini/utils/target_utils.py +74 -29
- {upgini-1.2.14a1.dist-info → upgini-1.2.14a3.dist-info}/METADATA +1 -1
- {upgini-1.2.14a1.dist-info → upgini-1.2.14a3.dist-info}/RECORD +8 -8
- {upgini-1.2.14a1.dist-info → upgini-1.2.14a3.dist-info}/WHEEL +0 -0
- {upgini-1.2.14a1.dist-info → upgini-1.2.14a3.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.14a1"
|
|
1
|
+
__version__ = "1.2.14a3"
|
upgini/dataset.py
CHANGED
|
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
53
53
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
54
54
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
55
55
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
56
|
-
|
|
56
|
+
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
57
|
+
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
57
58
|
IMBALANCE_THESHOLD = 0.6
|
|
58
59
|
BINARY_BOOTSTRAP_LOOPS = 5
|
|
59
60
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
226
|
train_segment = self.data
|
|
226
227
|
|
|
227
228
|
if self.task_type == ModelTaskType.MULTICLASS or (
|
|
228
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.
|
|
229
|
+
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
|
229
230
|
):
|
|
230
231
|
count = len(train_segment)
|
|
231
232
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
|
@@ -261,7 +262,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
261
262
|
target_column=target_column,
|
|
262
263
|
task_type=self.task_type,
|
|
263
264
|
random_state=self.random_state,
|
|
264
|
-
|
|
265
|
+
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
|
266
|
+
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
|
265
267
|
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
|
266
268
|
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
|
267
269
|
logger=self.logger,
|
|
@@ -203,11 +203,12 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
203
203
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
204
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
205
205
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
206
|
-
target_type_detected=\nDetected task type: {}\n
|
|
206
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
207
207
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
208
208
|
all_ok_community_invite=❓ Support request
|
|
209
209
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
210
210
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
211
|
+
imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
211
212
|
loss_selection_info=Using loss `{}` for feature selection
|
|
212
213
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
213
214
|
|
upgini/utils/target_utils.py
CHANGED
|
@@ -24,49 +24,87 @@ def define_task(
|
|
|
24
24
|
) -> ModelTaskType:
|
|
25
25
|
if logger is None:
|
|
26
26
|
logger = logging.getLogger()
|
|
27
|
+
|
|
28
|
+
# Replace inf and -inf with NaN to handle extreme values correctly
|
|
29
|
+
y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
|
|
30
|
+
|
|
31
|
+
# Drop NaN values from the target
|
|
27
32
|
target = y.dropna()
|
|
33
|
+
|
|
34
|
+
# Check if target is numeric and finite
|
|
28
35
|
if is_numeric_dtype(target):
|
|
29
36
|
target = target.loc[np.isfinite(target)]
|
|
30
37
|
else:
|
|
38
|
+
# If not numeric, drop empty strings as well
|
|
31
39
|
target = target.loc[target != ""]
|
|
40
|
+
|
|
41
|
+
# Raise error if there are no valid values left in the target
|
|
32
42
|
if len(target) == 0:
|
|
33
43
|
raise ValidationError(bundle.get("empty_target"))
|
|
44
|
+
|
|
45
|
+
# Count unique values in the target
|
|
34
46
|
target_items = target.nunique()
|
|
47
|
+
|
|
48
|
+
# Raise error if all target values are the same
|
|
35
49
|
if target_items == 1:
|
|
36
50
|
raise ValidationError(bundle.get("dataset_constant_target"))
|
|
51
|
+
|
|
52
|
+
reason = "" # Will store the reason for selecting the task type
|
|
53
|
+
|
|
54
|
+
# Binary classification case: exactly two unique values
|
|
37
55
|
if target_items == 2:
|
|
38
56
|
task = ModelTaskType.BINARY
|
|
57
|
+
reason = "only two unique label-values observed"
|
|
39
58
|
else:
|
|
59
|
+
# Attempt to convert target to numeric
|
|
40
60
|
try:
|
|
41
61
|
target = pd.to_numeric(target)
|
|
42
62
|
is_numeric = True
|
|
43
63
|
except Exception:
|
|
44
64
|
is_numeric = False
|
|
45
65
|
|
|
46
|
-
# If
|
|
66
|
+
# If target cannot be converted to numeric, assume multiclass classification
|
|
47
67
|
if not is_numeric:
|
|
48
68
|
task = ModelTaskType.MULTICLASS
|
|
69
|
+
reason = "non-numeric label values observed"
|
|
49
70
|
else:
|
|
71
|
+
# Calculate the ratio of unique values to total number of values
|
|
72
|
+
unique_ratio = target.nunique() / float(len(target))
|
|
73
|
+
|
|
74
|
+
# Multiclass classification: few unique values and integer encoding
|
|
50
75
|
if target.nunique() <= 50 and is_int_encoding(target.unique()):
|
|
51
76
|
task = ModelTaskType.MULTICLASS
|
|
77
|
+
reason = "few unique label-values observed and can be considered as categorical"
|
|
78
|
+
# Regression case: if there are date features, assume regression
|
|
52
79
|
elif has_date:
|
|
53
80
|
task = ModelTaskType.REGRESSION
|
|
81
|
+
reason = "date features are present, treating as regression"
|
|
54
82
|
else:
|
|
83
|
+
# Remove zero values and recalculate unique ratio
|
|
55
84
|
non_zero_target = target[target != 0]
|
|
56
85
|
target_items = non_zero_target.nunique()
|
|
57
86
|
target_ratio = target_items / len(non_zero_target)
|
|
87
|
+
|
|
88
|
+
# Use unique_ratio to determine whether to classify as regression or multiclass
|
|
58
89
|
if (
|
|
59
|
-
|
|
90
|
+
unique_ratio > 0.1 # Use threshold to differentiate between regression and classification
|
|
91
|
+
or (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
|
|
60
92
|
or target_items > 50
|
|
61
|
-
or target_ratio > 0.2
|
|
93
|
+
or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
|
|
62
94
|
):
|
|
63
95
|
task = ModelTaskType.REGRESSION
|
|
96
|
+
reason = "many unique label-values or non-integer floating point values observed"
|
|
64
97
|
else:
|
|
65
98
|
task = ModelTaskType.MULTICLASS
|
|
99
|
+
reason = "integer-like values with limited unique values observed"
|
|
100
|
+
|
|
101
|
+
# Log or print the reason for the selected task type
|
|
102
|
+
logger.info(f"Detected task type: {task} (Reason: {reason})")
|
|
66
103
|
|
|
67
|
-
|
|
104
|
+
# Print task type and reason if silent mode is off
|
|
68
105
|
if not silent:
|
|
69
|
-
print(bundle.get("target_type_detected").format(task))
|
|
106
|
+
print(bundle.get("target_type_detected").format(task, reason))
|
|
107
|
+
|
|
70
108
|
return task
|
|
71
109
|
|
|
72
110
|
|
|
@@ -81,8 +119,8 @@ def balance_undersample(
|
|
|
81
119
|
target_column: str,
|
|
82
120
|
task_type: ModelTaskType,
|
|
83
121
|
random_state: int,
|
|
84
|
-
|
|
85
|
-
|
|
122
|
+
binary_min_sample_threshold: int = 5000,
|
|
123
|
+
multiclass_min_sample_threshold: int = 25000,
|
|
86
124
|
binary_bootstrap_loops: int = 5,
|
|
87
125
|
multiclass_bootstrap_loops: int = 2,
|
|
88
126
|
logger: Optional[logging.Logger] = None,
|
|
@@ -96,52 +134,59 @@ def balance_undersample(
|
|
|
96
134
|
if SYSTEM_RECORD_ID not in df.columns:
|
|
97
135
|
raise Exception("System record id must be presented for undersampling")
|
|
98
136
|
|
|
99
|
-
count = len(df)
|
|
137
|
+
# count = len(df)
|
|
100
138
|
target = df[target_column].copy()
|
|
101
|
-
target_classes_count = target.nunique()
|
|
139
|
+
# target_classes_count = target.nunique()
|
|
102
140
|
|
|
103
141
|
vc = target.value_counts()
|
|
104
142
|
max_class_value = vc.index[0]
|
|
105
143
|
min_class_value = vc.index[len(vc) - 1]
|
|
106
144
|
max_class_count = vc[max_class_value]
|
|
107
145
|
min_class_count = vc[min_class_value]
|
|
146
|
+
num_classes = len(vc)
|
|
108
147
|
|
|
109
|
-
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold = int(min_class_percent * count)
|
|
148
|
+
# min_class_percent = imbalance_threshold / target_classes_count
|
|
149
|
+
# min_class_threshold = int(min_class_percent * count)
|
|
111
150
|
|
|
112
151
|
resampled_data = df
|
|
113
152
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
114
153
|
if task_type == ModelTaskType.MULTICLASS:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
|
|
122
|
-
msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
|
|
154
|
+
if len(df) > multiclass_min_sample_threshold and max_class_count > (
|
|
155
|
+
min_class_count * multiclass_bootstrap_loops
|
|
156
|
+
):
|
|
157
|
+
|
|
158
|
+
# msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
|
|
159
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
123
160
|
logger.warning(msg)
|
|
124
161
|
print(msg)
|
|
125
162
|
if warning_counter:
|
|
126
163
|
warning_counter.increment()
|
|
127
164
|
|
|
128
|
-
# 25% and lower classes will stay as is. Higher classes will be downsampled
|
|
129
165
|
sample_strategy = dict()
|
|
130
|
-
for
|
|
131
|
-
|
|
132
|
-
|
|
166
|
+
for class_value in vc.index:
|
|
167
|
+
if class_value == min_class_value:
|
|
168
|
+
continue
|
|
133
169
|
class_count = vc[class_value]
|
|
134
|
-
|
|
170
|
+
sample_size = min(
|
|
171
|
+
class_count,
|
|
172
|
+
multiclass_bootstrap_loops
|
|
173
|
+
* (
|
|
174
|
+
min_class_count
|
|
175
|
+
+ max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
|
|
176
|
+
),
|
|
177
|
+
)
|
|
178
|
+
sample_strategy[class_value] = int(sample_size)
|
|
135
179
|
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
136
180
|
X = df[SYSTEM_RECORD_ID]
|
|
137
181
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
138
182
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
139
183
|
|
|
140
184
|
resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
141
|
-
elif len(df) >
|
|
142
|
-
msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
143
|
-
|
|
144
|
-
)
|
|
185
|
+
elif len(df) > binary_min_sample_threshold:
|
|
186
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
187
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
188
|
+
# )
|
|
189
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
145
190
|
logger.warning(msg)
|
|
146
191
|
print(msg)
|
|
147
192
|
if warning_counter:
|
|
@@ -153,7 +198,7 @@ def balance_undersample(
|
|
|
153
198
|
# sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
154
199
|
sample_size = min(
|
|
155
200
|
max_class_count,
|
|
156
|
-
binary_bootstrap_loops * (min_class_count + max(
|
|
201
|
+
binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
|
|
157
202
|
)
|
|
158
203
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
159
204
|
resampled_data = df[
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=FtGXNn_dLawjJC8X1Icai3ZGyctjvAhmviJGa0zgVTM,25
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
6
|
upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=uo9CIQMg8VFeHlL_mY2dwOumQnr0TenJNPNOfXPWlPI,26715
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
54
54
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=FsGRmZ2B9-Y1Mnkh2esz41nNgAUX2nleQRUmuDzjk_s,10133
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.14a3.dist-info/METADATA,sha256=JIjXqPhg5R96xFOY1uDijJpO1LHxwkb-sh8Bf7Sc394,48579
|
|
61
|
+
upgini-1.2.14a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.14a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.14a3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|