upgini 1.2.14a3616.dev2__py3-none-any.whl → 1.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +6 -3
- upgini/features_enricher.py +19 -21
- upgini/normalizer/normalize_utils.py +22 -15
- upgini/resource_bundle/strings.properties +8 -1
- upgini/utils/target_utils.py +96 -46
- {upgini-1.2.14a3616.dev2.dist-info → upgini-1.2.15.dist-info}/METADATA +1 -1
- {upgini-1.2.14a3616.dev2.dist-info → upgini-1.2.15.dist-info}/RECORD +10 -10
- {upgini-1.2.14a3616.dev2.dist-info → upgini-1.2.15.dist-info}/WHEEL +1 -1
- {upgini-1.2.14a3616.dev2.dist-info → upgini-1.2.15.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.15"
|
upgini/dataset.py
CHANGED
|
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
53
53
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
54
54
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
55
55
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
56
|
-
|
|
56
|
+
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
57
|
+
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
57
58
|
IMBALANCE_THESHOLD = 0.6
|
|
58
59
|
BINARY_BOOTSTRAP_LOOPS = 5
|
|
59
60
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
226
|
train_segment = self.data
|
|
226
227
|
|
|
227
228
|
if self.task_type == ModelTaskType.MULTICLASS or (
|
|
228
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.
|
|
229
|
+
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
|
229
230
|
):
|
|
230
231
|
count = len(train_segment)
|
|
231
232
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
|
@@ -253,6 +254,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
253
254
|
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
|
254
255
|
min_class_threshold = min_class_percent * count
|
|
255
256
|
|
|
257
|
+
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
|
256
258
|
if min_class_count < min_class_threshold:
|
|
257
259
|
self.imbalanced = True
|
|
258
260
|
self.data = balance_undersample(
|
|
@@ -260,7 +262,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
260
262
|
target_column=target_column,
|
|
261
263
|
task_type=self.task_type,
|
|
262
264
|
random_state=self.random_state,
|
|
263
|
-
|
|
265
|
+
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
|
266
|
+
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
|
264
267
|
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
|
265
268
|
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
|
266
269
|
logger=self.logger,
|
upgini/features_enricher.py
CHANGED
|
@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1577
1577
|
df = generator.generate(df)
|
|
1578
1578
|
generated_features.extend(generator.generated_features)
|
|
1579
1579
|
|
|
1580
|
-
normalizer = Normalizer(
|
|
1581
|
-
df = normalizer.normalize(df)
|
|
1580
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
1581
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1582
1582
|
columns_renaming = normalizer.columns_renaming
|
|
1583
1583
|
|
|
1584
1584
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
@@ -2017,10 +2017,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2017
2017
|
df = generator.generate(df)
|
|
2018
2018
|
generated_features.extend(generator.generated_features)
|
|
2019
2019
|
|
|
2020
|
-
normalizer = Normalizer(
|
|
2021
|
-
|
|
2022
|
-
)
|
|
2023
|
-
df = normalizer.normalize(df)
|
|
2020
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
|
|
2021
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2024
2022
|
columns_renaming = normalizer.columns_renaming
|
|
2025
2023
|
|
|
2026
2024
|
# Don't pass all features in backend on transform
|
|
@@ -2449,14 +2447,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2449
2447
|
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2450
2448
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2451
2449
|
|
|
2452
|
-
self.
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2450
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
2451
|
+
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2452
|
+
df, self.fit_search_keys, self.fit_generated_features
|
|
2456
2453
|
)
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
self.
|
|
2454
|
+
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2455
|
+
|
|
2456
|
+
self.__adjust_cv(df)
|
|
2460
2457
|
|
|
2461
2458
|
df = remove_fintech_duplicates(
|
|
2462
2459
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2470,7 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2470
2467
|
self.df_with_original_index = df.copy()
|
|
2471
2468
|
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2472
2469
|
|
|
2473
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys,
|
|
2470
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
|
|
2474
2471
|
|
|
2475
2472
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2476
2473
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
@@ -2480,7 +2477,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2480
2477
|
email_column,
|
|
2481
2478
|
hem_column,
|
|
2482
2479
|
self.fit_search_keys,
|
|
2483
|
-
|
|
2480
|
+
self.fit_columns_renaming,
|
|
2484
2481
|
list(unnest_search_keys.keys()),
|
|
2485
2482
|
self.logger,
|
|
2486
2483
|
)
|
|
@@ -2491,7 +2488,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2491
2488
|
converter = IpSearchKeyConverter(
|
|
2492
2489
|
ip_column,
|
|
2493
2490
|
self.fit_search_keys,
|
|
2494
|
-
|
|
2491
|
+
self.fit_columns_renaming,
|
|
2495
2492
|
list(unnest_search_keys.keys()),
|
|
2496
2493
|
self.bundle,
|
|
2497
2494
|
self.logger,
|
|
@@ -2522,7 +2519,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2522
2519
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2523
2520
|
|
|
2524
2521
|
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2525
|
-
df, features_columns, self.generate_features, self.warning_counter,
|
|
2522
|
+
df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
|
|
2526
2523
|
)
|
|
2527
2524
|
self.fit_dropped_features.update(features_to_drop)
|
|
2528
2525
|
df = df.drop(columns=features_to_drop)
|
|
@@ -2563,7 +2560,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2563
2560
|
rest_client=self.rest_client,
|
|
2564
2561
|
logger=self.logger,
|
|
2565
2562
|
)
|
|
2566
|
-
dataset.columns_renaming =
|
|
2563
|
+
dataset.columns_renaming = self.fit_columns_renaming
|
|
2567
2564
|
|
|
2568
2565
|
self.passed_features = [
|
|
2569
2566
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2710,18 +2707,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2710
2707
|
if not self.warning_counter.has_warnings():
|
|
2711
2708
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2712
2709
|
|
|
2713
|
-
def __adjust_cv(self, df: pd.DataFrame
|
|
2710
|
+
def __adjust_cv(self, df: pd.DataFrame):
|
|
2711
|
+
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2714
2712
|
# Check Multivariate time series
|
|
2715
2713
|
if (
|
|
2716
2714
|
self.cv is None
|
|
2717
2715
|
and date_column
|
|
2718
|
-
and model_task_type == ModelTaskType.REGRESSION
|
|
2716
|
+
and self.model_task_type == ModelTaskType.REGRESSION
|
|
2719
2717
|
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
|
2720
2718
|
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
|
2721
2719
|
):
|
|
2722
2720
|
msg = self.bundle.get("multivariate_timeseries_detected")
|
|
2723
2721
|
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
|
2724
|
-
elif self.cv is None and model_task_type != ModelTaskType.REGRESSION:
|
|
2722
|
+
elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
|
|
2725
2723
|
msg = self.bundle.get("group_k_fold_in_classification")
|
|
2726
2724
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
2727
2725
|
group_columns = self._get_group_columns(df, self.fit_search_keys)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
from logging import Logger, getLogger
|
|
3
|
-
from typing import Dict, List
|
|
3
|
+
from typing import Dict, List, Tuple
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -35,22 +35,25 @@ class Normalizer:
|
|
|
35
35
|
|
|
36
36
|
def __init__(
|
|
37
37
|
self,
|
|
38
|
-
search_keys: Dict[str, SearchKey],
|
|
39
|
-
generated_features: List[str],
|
|
40
38
|
bundle: ResourceBundle = None,
|
|
41
39
|
logger: Logger = None,
|
|
42
40
|
warnings_counter: WarningCounter = None,
|
|
43
41
|
silent_mode=False,
|
|
44
42
|
):
|
|
45
|
-
self.search_keys = search_keys
|
|
46
|
-
self.generated_features = generated_features
|
|
47
43
|
self.bundle = bundle or get_custom_bundle()
|
|
48
44
|
self.logger = logger or getLogger()
|
|
49
45
|
self.warnings_counter = warnings_counter or WarningCounter()
|
|
50
46
|
self.silent_mode = silent_mode
|
|
51
47
|
self.columns_renaming = {}
|
|
48
|
+
self.search_keys = {}
|
|
49
|
+
self.generated_features = []
|
|
50
|
+
|
|
51
|
+
def normalize(
|
|
52
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
53
|
+
) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
|
|
54
|
+
self.search_keys = search_keys.copy()
|
|
55
|
+
self.generated_features = generated_features.copy()
|
|
52
56
|
|
|
53
|
-
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
54
57
|
df = df.copy()
|
|
55
58
|
df = self._rename_columns(df)
|
|
56
59
|
|
|
@@ -68,21 +71,25 @@ class Normalizer:
|
|
|
68
71
|
|
|
69
72
|
df = self.__convert_features_types(df)
|
|
70
73
|
|
|
71
|
-
return df
|
|
74
|
+
return df, self.search_keys, self.generated_features
|
|
72
75
|
|
|
73
76
|
def _rename_columns(self, df: pd.DataFrame):
|
|
74
77
|
# logger.info("Replace restricted symbols in column names")
|
|
75
78
|
new_columns = []
|
|
76
79
|
dup_counter = 0
|
|
77
80
|
for column in df.columns:
|
|
78
|
-
if
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
if (
|
|
82
|
+
column
|
|
83
|
+
in [
|
|
84
|
+
TARGET,
|
|
85
|
+
EVAL_SET_INDEX,
|
|
86
|
+
SYSTEM_RECORD_ID,
|
|
87
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
88
|
+
SEARCH_KEY_UNNEST,
|
|
89
|
+
DateTimeSearchKeyConverter.DATETIME_COL,
|
|
90
|
+
]
|
|
91
|
+
+ self.generated_features
|
|
92
|
+
):
|
|
86
93
|
self.columns_renaming[column] = column
|
|
87
94
|
new_columns.append(column)
|
|
88
95
|
continue
|
|
@@ -203,11 +203,18 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
203
203
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
204
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
205
205
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
206
|
-
target_type_detected=\nDetected task type: {}\n
|
|
206
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
207
|
+
binary_target_reason=only two unique label-values observed
|
|
208
|
+
non_numeric_multiclass_reason=non-numeric label values observed
|
|
209
|
+
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
210
|
+
date_search_key_regression_reason=date search key is present, treating as regression
|
|
211
|
+
many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
|
|
212
|
+
limited_int_multiclass_reason=integer-like values with limited unique values observed
|
|
207
213
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
208
214
|
all_ok_community_invite=❓ Support request
|
|
209
215
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
210
216
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
217
|
+
imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
211
218
|
loss_selection_info=Using loss `{}` for feature selection
|
|
212
219
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
213
220
|
|
upgini/utils/target_utils.py
CHANGED
|
@@ -24,49 +24,83 @@ def define_task(
|
|
|
24
24
|
) -> ModelTaskType:
|
|
25
25
|
if logger is None:
|
|
26
26
|
logger = logging.getLogger()
|
|
27
|
+
|
|
28
|
+
# Replace inf and -inf with NaN to handle extreme values correctly
|
|
29
|
+
y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
|
|
30
|
+
|
|
31
|
+
# Drop NaN values from the target
|
|
27
32
|
target = y.dropna()
|
|
33
|
+
|
|
34
|
+
# Check if target is numeric and finite
|
|
28
35
|
if is_numeric_dtype(target):
|
|
29
36
|
target = target.loc[np.isfinite(target)]
|
|
30
37
|
else:
|
|
38
|
+
# If not numeric, drop empty strings as well
|
|
31
39
|
target = target.loc[target != ""]
|
|
40
|
+
|
|
41
|
+
# Raise error if there are no valid values left in the target
|
|
32
42
|
if len(target) == 0:
|
|
33
43
|
raise ValidationError(bundle.get("empty_target"))
|
|
44
|
+
|
|
45
|
+
# Count unique values in the target
|
|
34
46
|
target_items = target.nunique()
|
|
47
|
+
|
|
48
|
+
# Raise error if all target values are the same
|
|
35
49
|
if target_items == 1:
|
|
36
50
|
raise ValidationError(bundle.get("dataset_constant_target"))
|
|
51
|
+
|
|
52
|
+
reason = "" # Will store the reason for selecting the task type
|
|
53
|
+
|
|
54
|
+
# Binary classification case: exactly two unique values
|
|
37
55
|
if target_items == 2:
|
|
38
56
|
task = ModelTaskType.BINARY
|
|
57
|
+
reason = bundle.get("binary_target_reason")
|
|
39
58
|
else:
|
|
59
|
+
# Attempt to convert target to numeric
|
|
40
60
|
try:
|
|
41
61
|
target = pd.to_numeric(target)
|
|
42
62
|
is_numeric = True
|
|
43
63
|
except Exception:
|
|
44
64
|
is_numeric = False
|
|
45
65
|
|
|
46
|
-
# If
|
|
66
|
+
# If target cannot be converted to numeric, assume multiclass classification
|
|
47
67
|
if not is_numeric:
|
|
48
68
|
task = ModelTaskType.MULTICLASS
|
|
69
|
+
reason = bundle.get("non_numeric_multiclass_reason")
|
|
49
70
|
else:
|
|
71
|
+
# Multiclass classification: few unique values and integer encoding
|
|
50
72
|
if target.nunique() <= 50 and is_int_encoding(target.unique()):
|
|
51
73
|
task = ModelTaskType.MULTICLASS
|
|
74
|
+
reason = bundle.get("few_unique_label_multiclass_reason")
|
|
75
|
+
# Regression case: if there is date, assume regression
|
|
52
76
|
elif has_date:
|
|
53
77
|
task = ModelTaskType.REGRESSION
|
|
78
|
+
reason = bundle.get("date_search_key_regression_reason")
|
|
54
79
|
else:
|
|
80
|
+
# Remove zero values and recalculate unique ratio
|
|
55
81
|
non_zero_target = target[target != 0]
|
|
56
82
|
target_items = non_zero_target.nunique()
|
|
57
83
|
target_ratio = target_items / len(non_zero_target)
|
|
84
|
+
|
|
85
|
+
# Use unique_ratio to determine whether to classify as regression or multiclass
|
|
58
86
|
if (
|
|
59
|
-
(target.dtype.kind == "f" and np.any(target != target.astype(int))) #
|
|
87
|
+
(target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
|
|
60
88
|
or target_items > 50
|
|
61
|
-
or target_ratio > 0.2
|
|
89
|
+
or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
|
|
62
90
|
):
|
|
63
91
|
task = ModelTaskType.REGRESSION
|
|
92
|
+
reason = bundle.get("many_unique_label_regression_reason")
|
|
64
93
|
else:
|
|
65
94
|
task = ModelTaskType.MULTICLASS
|
|
95
|
+
reason = bundle.get("limited_int_multiclass_reason")
|
|
66
96
|
|
|
67
|
-
|
|
97
|
+
# Log or print the reason for the selected task type
|
|
98
|
+
logger.info(f"Detected task type: {task} (Reason: {reason})")
|
|
99
|
+
|
|
100
|
+
# Print task type and reason if silent mode is off
|
|
68
101
|
if not silent:
|
|
69
|
-
print(bundle.get("target_type_detected").format(task))
|
|
102
|
+
print(bundle.get("target_type_detected").format(task, reason))
|
|
103
|
+
|
|
70
104
|
return task
|
|
71
105
|
|
|
72
106
|
|
|
@@ -81,8 +115,8 @@ def balance_undersample(
|
|
|
81
115
|
target_column: str,
|
|
82
116
|
task_type: ModelTaskType,
|
|
83
117
|
random_state: int,
|
|
84
|
-
|
|
85
|
-
|
|
118
|
+
binary_min_sample_threshold: int = 5000,
|
|
119
|
+
multiclass_min_sample_threshold: int = 25000,
|
|
86
120
|
binary_bootstrap_loops: int = 5,
|
|
87
121
|
multiclass_bootstrap_loops: int = 2,
|
|
88
122
|
logger: Optional[logging.Logger] = None,
|
|
@@ -96,52 +130,60 @@ def balance_undersample(
|
|
|
96
130
|
if SYSTEM_RECORD_ID not in df.columns:
|
|
97
131
|
raise Exception("System record id must be presented for undersampling")
|
|
98
132
|
|
|
99
|
-
count = len(df)
|
|
133
|
+
# count = len(df)
|
|
100
134
|
target = df[target_column].copy()
|
|
101
|
-
target_classes_count = target.nunique()
|
|
135
|
+
# target_classes_count = target.nunique()
|
|
102
136
|
|
|
103
137
|
vc = target.value_counts()
|
|
104
138
|
max_class_value = vc.index[0]
|
|
105
139
|
min_class_value = vc.index[len(vc) - 1]
|
|
106
140
|
max_class_count = vc[max_class_value]
|
|
107
141
|
min_class_count = vc[min_class_value]
|
|
142
|
+
num_classes = len(vc)
|
|
108
143
|
|
|
109
|
-
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold = int(min_class_percent * count)
|
|
144
|
+
# min_class_percent = imbalance_threshold / target_classes_count
|
|
145
|
+
# min_class_threshold = int(min_class_percent * count)
|
|
111
146
|
|
|
112
147
|
resampled_data = df
|
|
113
148
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
114
149
|
if task_type == ModelTaskType.MULTICLASS:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
|
|
122
|
-
msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
|
|
150
|
+
if len(df) > multiclass_min_sample_threshold and max_class_count > (
|
|
151
|
+
min_class_count * multiclass_bootstrap_loops
|
|
152
|
+
):
|
|
153
|
+
|
|
154
|
+
# msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
|
|
155
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
123
156
|
logger.warning(msg)
|
|
124
157
|
print(msg)
|
|
125
158
|
if warning_counter:
|
|
126
159
|
warning_counter.increment()
|
|
127
160
|
|
|
128
|
-
# 25% and lower classes will stay as is. Higher classes will be downsampled
|
|
129
161
|
sample_strategy = dict()
|
|
130
|
-
for
|
|
131
|
-
|
|
132
|
-
|
|
162
|
+
for class_value in vc.index:
|
|
163
|
+
if class_value == min_class_value:
|
|
164
|
+
continue
|
|
133
165
|
class_count = vc[class_value]
|
|
134
|
-
|
|
166
|
+
sample_size = min(
|
|
167
|
+
class_count,
|
|
168
|
+
multiclass_bootstrap_loops
|
|
169
|
+
* (
|
|
170
|
+
min_class_count
|
|
171
|
+
+ max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
|
|
172
|
+
),
|
|
173
|
+
)
|
|
174
|
+
sample_strategy[class_value] = int(sample_size)
|
|
175
|
+
logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
|
|
135
176
|
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
136
177
|
X = df[SYSTEM_RECORD_ID]
|
|
137
178
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
138
179
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
139
180
|
|
|
140
181
|
resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
141
|
-
elif len(df) >
|
|
142
|
-
msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
143
|
-
|
|
144
|
-
)
|
|
182
|
+
elif len(df) > binary_min_sample_threshold:
|
|
183
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
184
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
185
|
+
# )
|
|
186
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
145
187
|
logger.warning(msg)
|
|
146
188
|
print(msg)
|
|
147
189
|
if warning_counter:
|
|
@@ -150,30 +192,38 @@ def balance_undersample(
|
|
|
150
192
|
# fill up to min_sample_threshold by majority class
|
|
151
193
|
minority_class = df[df[target_column] == min_class_value]
|
|
152
194
|
majority_class = df[df[target_column] != min_class_value]
|
|
153
|
-
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
195
|
+
# sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
196
|
+
sample_size = min(
|
|
197
|
+
max_class_count,
|
|
198
|
+
binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
|
|
199
|
+
)
|
|
200
|
+
logger.info(
|
|
201
|
+
f"Min class count: {min_class_count}. Max class count: {max_class_count}."
|
|
202
|
+
f" Rebalance sample size: {sample_size}"
|
|
203
|
+
)
|
|
154
204
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
155
205
|
resampled_data = df[
|
|
156
206
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
157
207
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
158
208
|
]
|
|
159
209
|
|
|
160
|
-
elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
210
|
+
# elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
211
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
212
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
213
|
+
# )
|
|
214
|
+
# logger.warning(msg)
|
|
215
|
+
# print(msg)
|
|
216
|
+
# if warning_counter:
|
|
217
|
+
# warning_counter.increment()
|
|
218
|
+
|
|
219
|
+
# sampler = RandomUnderSampler(
|
|
220
|
+
# sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
|
|
221
|
+
# )
|
|
222
|
+
# X = df[SYSTEM_RECORD_ID]
|
|
223
|
+
# X = X.to_frame(SYSTEM_RECORD_ID)
|
|
224
|
+
# new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
225
|
+
|
|
226
|
+
# resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
177
227
|
|
|
178
228
|
logger.info(f"Shape after rebalance resampling: {resampled_data}")
|
|
179
229
|
return resampled_data
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=Q6rDLuL8XHKQggYBtRCtxzpPQJgFYWn4x0gcVlH7H4g,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
|
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
54
54
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.15.dist-info/METADATA,sha256=Hua2FUNftyzzpi9eR090MFJ-5F8S_KS_5SrZhwOUgco,48577
|
|
61
|
+
upgini-1.2.15.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.15.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.15.dist-info/RECORD,,
|
|
File without changes
|