upgini 1.2.14a2__py3-none-any.whl → 1.2.14a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/target_utils.py +48 -5
- {upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/METADATA +1 -1
- {upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/RECORD +7 -7
- {upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/WHEEL +0 -0
- {upgini-1.2.14a2.dist-info → upgini-1.2.14a4.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.14a2"
|
|
1
|
+
__version__ = "1.2.14a4"
|
|
@@ -203,7 +203,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
203
203
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
204
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
205
205
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
206
|
-
target_type_detected=\nDetected task type: {}\n
|
|
206
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
207
207
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
208
208
|
all_ok_community_invite=❓ Support request
|
|
209
209
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
upgini/utils/target_utils.py
CHANGED
|
@@ -24,49 +24,87 @@ def define_task(
|
|
|
24
24
|
) -> ModelTaskType:
|
|
25
25
|
if logger is None:
|
|
26
26
|
logger = logging.getLogger()
|
|
27
|
+
|
|
28
|
+
# Replace inf and -inf with NaN to handle extreme values correctly
|
|
29
|
+
y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
|
|
30
|
+
|
|
31
|
+
# Drop NaN values from the target
|
|
27
32
|
target = y.dropna()
|
|
33
|
+
|
|
34
|
+
# Check if target is numeric and finite
|
|
28
35
|
if is_numeric_dtype(target):
|
|
29
36
|
target = target.loc[np.isfinite(target)]
|
|
30
37
|
else:
|
|
38
|
+
# If not numeric, drop empty strings as well
|
|
31
39
|
target = target.loc[target != ""]
|
|
40
|
+
|
|
41
|
+
# Raise error if there are no valid values left in the target
|
|
32
42
|
if len(target) == 0:
|
|
33
43
|
raise ValidationError(bundle.get("empty_target"))
|
|
44
|
+
|
|
45
|
+
# Count unique values in the target
|
|
34
46
|
target_items = target.nunique()
|
|
47
|
+
|
|
48
|
+
# Raise error if all target values are the same
|
|
35
49
|
if target_items == 1:
|
|
36
50
|
raise ValidationError(bundle.get("dataset_constant_target"))
|
|
51
|
+
|
|
52
|
+
reason = "" # Will store the reason for selecting the task type
|
|
53
|
+
|
|
54
|
+
# Binary classification case: exactly two unique values
|
|
37
55
|
if target_items == 2:
|
|
38
56
|
task = ModelTaskType.BINARY
|
|
57
|
+
reason = "only two unique label-values observed"
|
|
39
58
|
else:
|
|
59
|
+
# Attempt to convert target to numeric
|
|
40
60
|
try:
|
|
41
61
|
target = pd.to_numeric(target)
|
|
42
62
|
is_numeric = True
|
|
43
63
|
except Exception:
|
|
44
64
|
is_numeric = False
|
|
45
65
|
|
|
46
|
-
# If
|
|
66
|
+
# If target cannot be converted to numeric, assume multiclass classification
|
|
47
67
|
if not is_numeric:
|
|
48
68
|
task = ModelTaskType.MULTICLASS
|
|
69
|
+
reason = "non-numeric label values observed"
|
|
49
70
|
else:
|
|
71
|
+
# Calculate the ratio of unique values to total number of values
|
|
72
|
+
unique_ratio = target.nunique() / float(len(target))
|
|
73
|
+
|
|
74
|
+
# Multiclass classification: few unique values and integer encoding
|
|
50
75
|
if target.nunique() <= 50 and is_int_encoding(target.unique()):
|
|
51
76
|
task = ModelTaskType.MULTICLASS
|
|
77
|
+
reason = "few unique label-values observed and can be considered as categorical"
|
|
78
|
+
# Regression case: if there are date features, assume regression
|
|
52
79
|
elif has_date:
|
|
53
80
|
task = ModelTaskType.REGRESSION
|
|
81
|
+
reason = "date features are present, treating as regression"
|
|
54
82
|
else:
|
|
83
|
+
# Remove zero values and recalculate unique ratio
|
|
55
84
|
non_zero_target = target[target != 0]
|
|
56
85
|
target_items = non_zero_target.nunique()
|
|
57
86
|
target_ratio = target_items / len(non_zero_target)
|
|
87
|
+
|
|
88
|
+
# Use unique_ratio to determine whether to classify as regression or multiclass
|
|
58
89
|
if (
|
|
59
|
-
|
|
90
|
+
unique_ratio > 0.1 # Use threshold to differentiate between regression and classification
|
|
91
|
+
or (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
|
|
60
92
|
or target_items > 50
|
|
61
|
-
or target_ratio > 0.2
|
|
93
|
+
or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
|
|
62
94
|
):
|
|
63
95
|
task = ModelTaskType.REGRESSION
|
|
96
|
+
reason = "many unique label-values or non-integer floating point values observed"
|
|
64
97
|
else:
|
|
65
98
|
task = ModelTaskType.MULTICLASS
|
|
99
|
+
reason = "integer-like values with limited unique values observed"
|
|
66
100
|
|
|
67
|
-
|
|
101
|
+
# Log or print the reason for the selected task type
|
|
102
|
+
logger.info(f"Detected task type: {task} (Reason: {reason})")
|
|
103
|
+
|
|
104
|
+
# Print task type and reason if silent mode is off
|
|
68
105
|
if not silent:
|
|
69
|
-
print(bundle.get("target_type_detected").format(task))
|
|
106
|
+
print(bundle.get("target_type_detected").format(task, reason))
|
|
107
|
+
|
|
70
108
|
return task
|
|
71
109
|
|
|
72
110
|
|
|
@@ -138,6 +176,7 @@ def balance_undersample(
|
|
|
138
176
|
),
|
|
139
177
|
)
|
|
140
178
|
sample_strategy[class_value] = int(sample_size)
|
|
179
|
+
logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
|
|
141
180
|
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
142
181
|
X = df[SYSTEM_RECORD_ID]
|
|
143
182
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
@@ -162,6 +201,10 @@ def balance_undersample(
|
|
|
162
201
|
max_class_count,
|
|
163
202
|
binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
|
|
164
203
|
)
|
|
204
|
+
logger.info(
|
|
205
|
+
f"Min class count: {min_class_count}. Max class count: {max_class_count}."
|
|
206
|
+
f" Rebalance sample size: {sample_size}"
|
|
207
|
+
)
|
|
165
208
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
166
209
|
resampled_data = df[
|
|
167
210
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=tfhdEEoOzTUSKNF9hQy8PZO57ri0xEeduAwFCwtVLCg,25
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=uo9CIQMg8VFeHlL_mY2dwOumQnr0TenJNPNOfXPWlPI,26715
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
54
54
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=3itoOxwEycnIdWeTL3KjuS_NdJleL6nMRqblQLmy6Kk,10413
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.14a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
-
upgini-1.2.14a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
-
upgini-1.2.14a2.dist-info/RECORD,,
|
|
60
|
+
upgini-1.2.14a4.dist-info/METADATA,sha256=iwhYx9Mru7TEEylcJbeOaZlnKVBcpMrnYXzNEU2M4fg,48579
|
|
61
|
+
upgini-1.2.14a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.14a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.14a4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|