upgini 1.2.14a1__tar.gz → 1.2.14a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.14a1 → upgini-1.2.14a2}/PKG-INFO +1 -1
- upgini-1.2.14a2/src/upgini/__about__.py +1 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/dataset.py +5 -3
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/target_utils.py +31 -24
- upgini-1.2.14a1/src/upgini/__about__.py +0 -1
- {upgini-1.2.14a1 → upgini-1.2.14a2}/.gitignore +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/LICENSE +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/README.md +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/pyproject.toml +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/ads.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/errors.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/features_enricher.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/http.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/metadata.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/metrics.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/search_task.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/spinner.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.14a2"
|
|
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
53
53
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
54
54
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
55
55
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
56
|
-
|
|
56
|
+
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
57
|
+
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
57
58
|
IMBALANCE_THESHOLD = 0.6
|
|
58
59
|
BINARY_BOOTSTRAP_LOOPS = 5
|
|
59
60
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
226
|
train_segment = self.data
|
|
226
227
|
|
|
227
228
|
if self.task_type == ModelTaskType.MULTICLASS or (
|
|
228
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.
|
|
229
|
+
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
|
229
230
|
):
|
|
230
231
|
count = len(train_segment)
|
|
231
232
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
|
@@ -261,7 +262,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
261
262
|
target_column=target_column,
|
|
262
263
|
task_type=self.task_type,
|
|
263
264
|
random_state=self.random_state,
|
|
264
|
-
|
|
265
|
+
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
|
266
|
+
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
|
265
267
|
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
|
266
268
|
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
|
267
269
|
logger=self.logger,
|
|
@@ -208,6 +208,7 @@ target_type_detected=\nDetected task type: {}\n
|
|
|
208
208
|
all_ok_community_invite=❓ Support request
|
|
209
209
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
210
210
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
211
|
+
imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
211
212
|
loss_selection_info=Using loss `{}` for feature selection
|
|
212
213
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
213
214
|
|
|
@@ -81,8 +81,8 @@ def balance_undersample(
|
|
|
81
81
|
target_column: str,
|
|
82
82
|
task_type: ModelTaskType,
|
|
83
83
|
random_state: int,
|
|
84
|
-
|
|
85
|
-
|
|
84
|
+
binary_min_sample_threshold: int = 5000,
|
|
85
|
+
multiclass_min_sample_threshold: int = 25000,
|
|
86
86
|
binary_bootstrap_loops: int = 5,
|
|
87
87
|
multiclass_bootstrap_loops: int = 2,
|
|
88
88
|
logger: Optional[logging.Logger] = None,
|
|
@@ -96,52 +96,59 @@ def balance_undersample(
|
|
|
96
96
|
if SYSTEM_RECORD_ID not in df.columns:
|
|
97
97
|
raise Exception("System record id must be presented for undersampling")
|
|
98
98
|
|
|
99
|
-
count = len(df)
|
|
99
|
+
# count = len(df)
|
|
100
100
|
target = df[target_column].copy()
|
|
101
|
-
target_classes_count = target.nunique()
|
|
101
|
+
# target_classes_count = target.nunique()
|
|
102
102
|
|
|
103
103
|
vc = target.value_counts()
|
|
104
104
|
max_class_value = vc.index[0]
|
|
105
105
|
min_class_value = vc.index[len(vc) - 1]
|
|
106
106
|
max_class_count = vc[max_class_value]
|
|
107
107
|
min_class_count = vc[min_class_value]
|
|
108
|
+
num_classes = len(vc)
|
|
108
109
|
|
|
109
|
-
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold = int(min_class_percent * count)
|
|
110
|
+
# min_class_percent = imbalance_threshold / target_classes_count
|
|
111
|
+
# min_class_threshold = int(min_class_percent * count)
|
|
111
112
|
|
|
112
113
|
resampled_data = df
|
|
113
114
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
114
115
|
if task_type == ModelTaskType.MULTICLASS:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
|
|
122
|
-
msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
|
|
116
|
+
if len(df) > multiclass_min_sample_threshold and max_class_count > (
|
|
117
|
+
min_class_count * multiclass_bootstrap_loops
|
|
118
|
+
):
|
|
119
|
+
|
|
120
|
+
# msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
|
|
121
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
123
122
|
logger.warning(msg)
|
|
124
123
|
print(msg)
|
|
125
124
|
if warning_counter:
|
|
126
125
|
warning_counter.increment()
|
|
127
126
|
|
|
128
|
-
# 25% and lower classes will stay as is. Higher classes will be downsampled
|
|
129
127
|
sample_strategy = dict()
|
|
130
|
-
for
|
|
131
|
-
|
|
132
|
-
|
|
128
|
+
for class_value in vc.index:
|
|
129
|
+
if class_value == min_class_value:
|
|
130
|
+
continue
|
|
133
131
|
class_count = vc[class_value]
|
|
134
|
-
|
|
132
|
+
sample_size = min(
|
|
133
|
+
class_count,
|
|
134
|
+
multiclass_bootstrap_loops
|
|
135
|
+
* (
|
|
136
|
+
min_class_count
|
|
137
|
+
+ max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
|
|
138
|
+
),
|
|
139
|
+
)
|
|
140
|
+
sample_strategy[class_value] = int(sample_size)
|
|
135
141
|
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
136
142
|
X = df[SYSTEM_RECORD_ID]
|
|
137
143
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
138
144
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
139
145
|
|
|
140
146
|
resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
141
|
-
elif len(df) >
|
|
142
|
-
msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
143
|
-
|
|
144
|
-
)
|
|
147
|
+
elif len(df) > binary_min_sample_threshold:
|
|
148
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
149
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
150
|
+
# )
|
|
151
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
145
152
|
logger.warning(msg)
|
|
146
153
|
print(msg)
|
|
147
154
|
if warning_counter:
|
|
@@ -153,7 +160,7 @@ def balance_undersample(
|
|
|
153
160
|
# sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
154
161
|
sample_size = min(
|
|
155
162
|
max_class_count,
|
|
156
|
-
binary_bootstrap_loops * (min_class_count + max(
|
|
163
|
+
binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
|
|
157
164
|
)
|
|
158
165
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
159
166
|
resampled_data = df[
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.14a1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|