upgini 1.2.14a3616.dev3__py3-none-any.whl → 1.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +6 -3
- upgini/features_enricher.py +21 -26
- upgini/normalizer/normalize_utils.py +22 -15
- upgini/resource_bundle/strings.properties +8 -1
- upgini/utils/target_utils.py +96 -46
- {upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/METADATA +1 -1
- {upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/RECORD +10 -10
- {upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/WHEEL +1 -1
- {upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.15"
|
upgini/dataset.py
CHANGED
|
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
53
53
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
54
54
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
55
55
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
56
|
-
|
|
56
|
+
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
57
|
+
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
57
58
|
IMBALANCE_THESHOLD = 0.6
|
|
58
59
|
BINARY_BOOTSTRAP_LOOPS = 5
|
|
59
60
|
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
|
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
225
226
|
train_segment = self.data
|
|
226
227
|
|
|
227
228
|
if self.task_type == ModelTaskType.MULTICLASS or (
|
|
228
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.
|
|
229
|
+
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
|
229
230
|
):
|
|
230
231
|
count = len(train_segment)
|
|
231
232
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
|
@@ -253,6 +254,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
253
254
|
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
|
254
255
|
min_class_threshold = min_class_percent * count
|
|
255
256
|
|
|
257
|
+
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
|
256
258
|
if min_class_count < min_class_threshold:
|
|
257
259
|
self.imbalanced = True
|
|
258
260
|
self.data = balance_undersample(
|
|
@@ -260,7 +262,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
260
262
|
target_column=target_column,
|
|
261
263
|
task_type=self.task_type,
|
|
262
264
|
random_state=self.random_state,
|
|
263
|
-
|
|
265
|
+
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
|
266
|
+
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
|
264
267
|
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
|
265
268
|
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
|
266
269
|
logger=self.logger,
|
upgini/features_enricher.py
CHANGED
|
@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1577
1577
|
df = generator.generate(df)
|
|
1578
1578
|
generated_features.extend(generator.generated_features)
|
|
1579
1579
|
|
|
1580
|
-
normalizer = Normalizer(
|
|
1581
|
-
df = normalizer.normalize(df)
|
|
1580
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
1581
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1582
1582
|
columns_renaming = normalizer.columns_renaming
|
|
1583
1583
|
|
|
1584
1584
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
@@ -2017,10 +2017,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2017
2017
|
df = generator.generate(df)
|
|
2018
2018
|
generated_features.extend(generator.generated_features)
|
|
2019
2019
|
|
|
2020
|
-
normalizer = Normalizer(
|
|
2021
|
-
|
|
2022
|
-
)
|
|
2023
|
-
df = normalizer.normalize(df)
|
|
2020
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
|
|
2021
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2024
2022
|
columns_renaming = normalizer.columns_renaming
|
|
2025
2023
|
|
|
2026
2024
|
# Don't pass all features in backend on transform
|
|
@@ -2449,16 +2447,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2449
2447
|
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2450
2448
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2451
2449
|
|
|
2452
|
-
normalizer = Normalizer(
|
|
2453
|
-
|
|
2450
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
2451
|
+
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2452
|
+
df, self.fit_search_keys, self.fit_generated_features
|
|
2454
2453
|
)
|
|
2455
|
-
|
|
2456
|
-
columns_renaming = normalizer.columns_renaming
|
|
2457
|
-
self.fit_columns_renaming = columns_renaming
|
|
2454
|
+
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2458
2455
|
|
|
2459
|
-
self.__adjust_cv(
|
|
2460
|
-
df, normalizer.search_keys, self.model_task_type
|
|
2461
|
-
)
|
|
2456
|
+
self.__adjust_cv(df)
|
|
2462
2457
|
|
|
2463
2458
|
df = remove_fintech_duplicates(
|
|
2464
2459
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2472,7 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2472
2467
|
self.df_with_original_index = df.copy()
|
|
2473
2468
|
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2474
2469
|
|
|
2475
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys,
|
|
2470
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
|
|
2476
2471
|
|
|
2477
2472
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2478
2473
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
@@ -2482,7 +2477,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2482
2477
|
email_column,
|
|
2483
2478
|
hem_column,
|
|
2484
2479
|
self.fit_search_keys,
|
|
2485
|
-
|
|
2480
|
+
self.fit_columns_renaming,
|
|
2486
2481
|
list(unnest_search_keys.keys()),
|
|
2487
2482
|
self.logger,
|
|
2488
2483
|
)
|
|
@@ -2493,7 +2488,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2493
2488
|
converter = IpSearchKeyConverter(
|
|
2494
2489
|
ip_column,
|
|
2495
2490
|
self.fit_search_keys,
|
|
2496
|
-
|
|
2491
|
+
self.fit_columns_renaming,
|
|
2497
2492
|
list(unnest_search_keys.keys()),
|
|
2498
2493
|
self.bundle,
|
|
2499
2494
|
self.logger,
|
|
@@ -2524,7 +2519,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2524
2519
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2525
2520
|
|
|
2526
2521
|
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2527
|
-
df, features_columns, self.generate_features, self.warning_counter,
|
|
2522
|
+
df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
|
|
2528
2523
|
)
|
|
2529
2524
|
self.fit_dropped_features.update(features_to_drop)
|
|
2530
2525
|
df = df.drop(columns=features_to_drop)
|
|
@@ -2565,7 +2560,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2565
2560
|
rest_client=self.rest_client,
|
|
2566
2561
|
logger=self.logger,
|
|
2567
2562
|
)
|
|
2568
|
-
dataset.columns_renaming =
|
|
2563
|
+
dataset.columns_renaming = self.fit_columns_renaming
|
|
2569
2564
|
|
|
2570
2565
|
self.passed_features = [
|
|
2571
2566
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2712,22 +2707,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2712
2707
|
if not self.warning_counter.has_warnings():
|
|
2713
2708
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2714
2709
|
|
|
2715
|
-
def __adjust_cv(self, df: pd.DataFrame
|
|
2716
|
-
date_column = SearchKey.find_key(
|
|
2710
|
+
def __adjust_cv(self, df: pd.DataFrame):
|
|
2711
|
+
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2717
2712
|
# Check Multivariate time series
|
|
2718
2713
|
if (
|
|
2719
2714
|
self.cv is None
|
|
2720
2715
|
and date_column
|
|
2721
|
-
and model_task_type == ModelTaskType.REGRESSION
|
|
2722
|
-
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(
|
|
2723
|
-
and is_blocked_time_series(df, date_column, list(
|
|
2716
|
+
and self.model_task_type == ModelTaskType.REGRESSION
|
|
2717
|
+
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
|
2718
|
+
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
|
2724
2719
|
):
|
|
2725
2720
|
msg = self.bundle.get("multivariate_timeseries_detected")
|
|
2726
2721
|
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
|
2727
|
-
elif self.cv is None and model_task_type != ModelTaskType.REGRESSION:
|
|
2722
|
+
elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
|
|
2728
2723
|
msg = self.bundle.get("group_k_fold_in_classification")
|
|
2729
2724
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
2730
|
-
group_columns = self._get_group_columns(df,
|
|
2725
|
+
group_columns = self._get_group_columns(df, self.fit_search_keys)
|
|
2731
2726
|
self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
|
|
2732
2727
|
self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
|
|
2733
2728
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
from logging import Logger, getLogger
|
|
3
|
-
from typing import Dict, List
|
|
3
|
+
from typing import Dict, List, Tuple
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -35,22 +35,25 @@ class Normalizer:
|
|
|
35
35
|
|
|
36
36
|
def __init__(
|
|
37
37
|
self,
|
|
38
|
-
search_keys: Dict[str, SearchKey],
|
|
39
|
-
generated_features: List[str],
|
|
40
38
|
bundle: ResourceBundle = None,
|
|
41
39
|
logger: Logger = None,
|
|
42
40
|
warnings_counter: WarningCounter = None,
|
|
43
41
|
silent_mode=False,
|
|
44
42
|
):
|
|
45
|
-
self.search_keys = search_keys
|
|
46
|
-
self.generated_features = generated_features
|
|
47
43
|
self.bundle = bundle or get_custom_bundle()
|
|
48
44
|
self.logger = logger or getLogger()
|
|
49
45
|
self.warnings_counter = warnings_counter or WarningCounter()
|
|
50
46
|
self.silent_mode = silent_mode
|
|
51
47
|
self.columns_renaming = {}
|
|
48
|
+
self.search_keys = {}
|
|
49
|
+
self.generated_features = []
|
|
50
|
+
|
|
51
|
+
def normalize(
|
|
52
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
53
|
+
) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
|
|
54
|
+
self.search_keys = search_keys.copy()
|
|
55
|
+
self.generated_features = generated_features.copy()
|
|
52
56
|
|
|
53
|
-
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
54
57
|
df = df.copy()
|
|
55
58
|
df = self._rename_columns(df)
|
|
56
59
|
|
|
@@ -68,21 +71,25 @@ class Normalizer:
|
|
|
68
71
|
|
|
69
72
|
df = self.__convert_features_types(df)
|
|
70
73
|
|
|
71
|
-
return df
|
|
74
|
+
return df, self.search_keys, self.generated_features
|
|
72
75
|
|
|
73
76
|
def _rename_columns(self, df: pd.DataFrame):
|
|
74
77
|
# logger.info("Replace restricted symbols in column names")
|
|
75
78
|
new_columns = []
|
|
76
79
|
dup_counter = 0
|
|
77
80
|
for column in df.columns:
|
|
78
|
-
if
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
if (
|
|
82
|
+
column
|
|
83
|
+
in [
|
|
84
|
+
TARGET,
|
|
85
|
+
EVAL_SET_INDEX,
|
|
86
|
+
SYSTEM_RECORD_ID,
|
|
87
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
88
|
+
SEARCH_KEY_UNNEST,
|
|
89
|
+
DateTimeSearchKeyConverter.DATETIME_COL,
|
|
90
|
+
]
|
|
91
|
+
+ self.generated_features
|
|
92
|
+
):
|
|
86
93
|
self.columns_renaming[column] = column
|
|
87
94
|
new_columns.append(column)
|
|
88
95
|
continue
|
|
@@ -203,11 +203,18 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
203
203
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
204
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
205
205
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
206
|
-
target_type_detected=\nDetected task type: {}\n
|
|
206
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
207
|
+
binary_target_reason=only two unique label-values observed
|
|
208
|
+
non_numeric_multiclass_reason=non-numeric label values observed
|
|
209
|
+
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
210
|
+
date_search_key_regression_reason=date search key is present, treating as regression
|
|
211
|
+
many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
|
|
212
|
+
limited_int_multiclass_reason=integer-like values with limited unique values observed
|
|
207
213
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
208
214
|
all_ok_community_invite=❓ Support request
|
|
209
215
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
210
216
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
217
|
+
imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
211
218
|
loss_selection_info=Using loss `{}` for feature selection
|
|
212
219
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
213
220
|
|
upgini/utils/target_utils.py
CHANGED
|
@@ -24,49 +24,83 @@ def define_task(
|
|
|
24
24
|
) -> ModelTaskType:
|
|
25
25
|
if logger is None:
|
|
26
26
|
logger = logging.getLogger()
|
|
27
|
+
|
|
28
|
+
# Replace inf and -inf with NaN to handle extreme values correctly
|
|
29
|
+
y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
|
|
30
|
+
|
|
31
|
+
# Drop NaN values from the target
|
|
27
32
|
target = y.dropna()
|
|
33
|
+
|
|
34
|
+
# Check if target is numeric and finite
|
|
28
35
|
if is_numeric_dtype(target):
|
|
29
36
|
target = target.loc[np.isfinite(target)]
|
|
30
37
|
else:
|
|
38
|
+
# If not numeric, drop empty strings as well
|
|
31
39
|
target = target.loc[target != ""]
|
|
40
|
+
|
|
41
|
+
# Raise error if there are no valid values left in the target
|
|
32
42
|
if len(target) == 0:
|
|
33
43
|
raise ValidationError(bundle.get("empty_target"))
|
|
44
|
+
|
|
45
|
+
# Count unique values in the target
|
|
34
46
|
target_items = target.nunique()
|
|
47
|
+
|
|
48
|
+
# Raise error if all target values are the same
|
|
35
49
|
if target_items == 1:
|
|
36
50
|
raise ValidationError(bundle.get("dataset_constant_target"))
|
|
51
|
+
|
|
52
|
+
reason = "" # Will store the reason for selecting the task type
|
|
53
|
+
|
|
54
|
+
# Binary classification case: exactly two unique values
|
|
37
55
|
if target_items == 2:
|
|
38
56
|
task = ModelTaskType.BINARY
|
|
57
|
+
reason = bundle.get("binary_target_reason")
|
|
39
58
|
else:
|
|
59
|
+
# Attempt to convert target to numeric
|
|
40
60
|
try:
|
|
41
61
|
target = pd.to_numeric(target)
|
|
42
62
|
is_numeric = True
|
|
43
63
|
except Exception:
|
|
44
64
|
is_numeric = False
|
|
45
65
|
|
|
46
|
-
# If
|
|
66
|
+
# If target cannot be converted to numeric, assume multiclass classification
|
|
47
67
|
if not is_numeric:
|
|
48
68
|
task = ModelTaskType.MULTICLASS
|
|
69
|
+
reason = bundle.get("non_numeric_multiclass_reason")
|
|
49
70
|
else:
|
|
71
|
+
# Multiclass classification: few unique values and integer encoding
|
|
50
72
|
if target.nunique() <= 50 and is_int_encoding(target.unique()):
|
|
51
73
|
task = ModelTaskType.MULTICLASS
|
|
74
|
+
reason = bundle.get("few_unique_label_multiclass_reason")
|
|
75
|
+
# Regression case: if there is date, assume regression
|
|
52
76
|
elif has_date:
|
|
53
77
|
task = ModelTaskType.REGRESSION
|
|
78
|
+
reason = bundle.get("date_search_key_regression_reason")
|
|
54
79
|
else:
|
|
80
|
+
# Remove zero values and recalculate unique ratio
|
|
55
81
|
non_zero_target = target[target != 0]
|
|
56
82
|
target_items = non_zero_target.nunique()
|
|
57
83
|
target_ratio = target_items / len(non_zero_target)
|
|
84
|
+
|
|
85
|
+
# Use unique_ratio to determine whether to classify as regression or multiclass
|
|
58
86
|
if (
|
|
59
|
-
(target.dtype.kind == "f" and np.any(target != target.astype(int))) #
|
|
87
|
+
(target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
|
|
60
88
|
or target_items > 50
|
|
61
|
-
or target_ratio > 0.2
|
|
89
|
+
or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
|
|
62
90
|
):
|
|
63
91
|
task = ModelTaskType.REGRESSION
|
|
92
|
+
reason = bundle.get("many_unique_label_regression_reason")
|
|
64
93
|
else:
|
|
65
94
|
task = ModelTaskType.MULTICLASS
|
|
95
|
+
reason = bundle.get("limited_int_multiclass_reason")
|
|
66
96
|
|
|
67
|
-
|
|
97
|
+
# Log or print the reason for the selected task type
|
|
98
|
+
logger.info(f"Detected task type: {task} (Reason: {reason})")
|
|
99
|
+
|
|
100
|
+
# Print task type and reason if silent mode is off
|
|
68
101
|
if not silent:
|
|
69
|
-
print(bundle.get("target_type_detected").format(task))
|
|
102
|
+
print(bundle.get("target_type_detected").format(task, reason))
|
|
103
|
+
|
|
70
104
|
return task
|
|
71
105
|
|
|
72
106
|
|
|
@@ -81,8 +115,8 @@ def balance_undersample(
|
|
|
81
115
|
target_column: str,
|
|
82
116
|
task_type: ModelTaskType,
|
|
83
117
|
random_state: int,
|
|
84
|
-
|
|
85
|
-
|
|
118
|
+
binary_min_sample_threshold: int = 5000,
|
|
119
|
+
multiclass_min_sample_threshold: int = 25000,
|
|
86
120
|
binary_bootstrap_loops: int = 5,
|
|
87
121
|
multiclass_bootstrap_loops: int = 2,
|
|
88
122
|
logger: Optional[logging.Logger] = None,
|
|
@@ -96,52 +130,60 @@ def balance_undersample(
|
|
|
96
130
|
if SYSTEM_RECORD_ID not in df.columns:
|
|
97
131
|
raise Exception("System record id must be presented for undersampling")
|
|
98
132
|
|
|
99
|
-
count = len(df)
|
|
133
|
+
# count = len(df)
|
|
100
134
|
target = df[target_column].copy()
|
|
101
|
-
target_classes_count = target.nunique()
|
|
135
|
+
# target_classes_count = target.nunique()
|
|
102
136
|
|
|
103
137
|
vc = target.value_counts()
|
|
104
138
|
max_class_value = vc.index[0]
|
|
105
139
|
min_class_value = vc.index[len(vc) - 1]
|
|
106
140
|
max_class_count = vc[max_class_value]
|
|
107
141
|
min_class_count = vc[min_class_value]
|
|
142
|
+
num_classes = len(vc)
|
|
108
143
|
|
|
109
|
-
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold = int(min_class_percent * count)
|
|
144
|
+
# min_class_percent = imbalance_threshold / target_classes_count
|
|
145
|
+
# min_class_threshold = int(min_class_percent * count)
|
|
111
146
|
|
|
112
147
|
resampled_data = df
|
|
113
148
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
114
149
|
if task_type == ModelTaskType.MULTICLASS:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
|
|
122
|
-
msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
|
|
150
|
+
if len(df) > multiclass_min_sample_threshold and max_class_count > (
|
|
151
|
+
min_class_count * multiclass_bootstrap_loops
|
|
152
|
+
):
|
|
153
|
+
|
|
154
|
+
# msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
|
|
155
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
123
156
|
logger.warning(msg)
|
|
124
157
|
print(msg)
|
|
125
158
|
if warning_counter:
|
|
126
159
|
warning_counter.increment()
|
|
127
160
|
|
|
128
|
-
# 25% and lower classes will stay as is. Higher classes will be downsampled
|
|
129
161
|
sample_strategy = dict()
|
|
130
|
-
for
|
|
131
|
-
|
|
132
|
-
|
|
162
|
+
for class_value in vc.index:
|
|
163
|
+
if class_value == min_class_value:
|
|
164
|
+
continue
|
|
133
165
|
class_count = vc[class_value]
|
|
134
|
-
|
|
166
|
+
sample_size = min(
|
|
167
|
+
class_count,
|
|
168
|
+
multiclass_bootstrap_loops
|
|
169
|
+
* (
|
|
170
|
+
min_class_count
|
|
171
|
+
+ max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
|
|
172
|
+
),
|
|
173
|
+
)
|
|
174
|
+
sample_strategy[class_value] = int(sample_size)
|
|
175
|
+
logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
|
|
135
176
|
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
136
177
|
X = df[SYSTEM_RECORD_ID]
|
|
137
178
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
138
179
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
139
180
|
|
|
140
181
|
resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
141
|
-
elif len(df) >
|
|
142
|
-
msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
143
|
-
|
|
144
|
-
)
|
|
182
|
+
elif len(df) > binary_min_sample_threshold:
|
|
183
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
184
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
185
|
+
# )
|
|
186
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
145
187
|
logger.warning(msg)
|
|
146
188
|
print(msg)
|
|
147
189
|
if warning_counter:
|
|
@@ -150,30 +192,38 @@ def balance_undersample(
|
|
|
150
192
|
# fill up to min_sample_threshold by majority class
|
|
151
193
|
minority_class = df[df[target_column] == min_class_value]
|
|
152
194
|
majority_class = df[df[target_column] != min_class_value]
|
|
153
|
-
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
195
|
+
# sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
196
|
+
sample_size = min(
|
|
197
|
+
max_class_count,
|
|
198
|
+
binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
|
|
199
|
+
)
|
|
200
|
+
logger.info(
|
|
201
|
+
f"Min class count: {min_class_count}. Max class count: {max_class_count}."
|
|
202
|
+
f" Rebalance sample size: {sample_size}"
|
|
203
|
+
)
|
|
154
204
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
155
205
|
resampled_data = df[
|
|
156
206
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
157
207
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
158
208
|
]
|
|
159
209
|
|
|
160
|
-
elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
210
|
+
# elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
211
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
212
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
213
|
+
# )
|
|
214
|
+
# logger.warning(msg)
|
|
215
|
+
# print(msg)
|
|
216
|
+
# if warning_counter:
|
|
217
|
+
# warning_counter.increment()
|
|
218
|
+
|
|
219
|
+
# sampler = RandomUnderSampler(
|
|
220
|
+
# sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
|
|
221
|
+
# )
|
|
222
|
+
# X = df[SYSTEM_RECORD_ID]
|
|
223
|
+
# X = X.to_frame(SYSTEM_RECORD_ID)
|
|
224
|
+
# new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
225
|
+
|
|
226
|
+
# resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
177
227
|
|
|
178
228
|
logger.info(f"Shape after rebalance resampling: {resampled_data}")
|
|
179
229
|
return resampled_data
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=Q6rDLuL8XHKQggYBtRCtxzpPQJgFYWn4x0gcVlH7H4g,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
|
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
54
54
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
55
55
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
56
56
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
57
|
-
upgini/utils/target_utils.py,sha256=
|
|
57
|
+
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.15.dist-info/METADATA,sha256=Hua2FUNftyzzpi9eR090MFJ-5F8S_KS_5SrZhwOUgco,48577
|
|
61
|
+
upgini-1.2.15.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.15.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.15.dist-info/RECORD,,
|
|
File without changes
|