upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/autofe/all_operands.py +12 -2
- upgini/autofe/date.py +68 -8
- upgini/autofe/feature.py +1 -1
- upgini/data_source/data_source_publisher.py +24 -5
- upgini/dataset.py +21 -58
- upgini/features_enricher.py +114 -40
- upgini/fingerprint.js +8 -0
- upgini/metrics.py +58 -7
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +8 -3
- upgini/search_task.py +1 -1
- upgini/utils/datetime_utils.py +53 -2
- upgini/utils/deduplicate_utils.py +61 -18
- upgini/utils/sklearn_ext.py +1 -2
- upgini/utils/target_utils.py +125 -2
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/METADATA +2 -2
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/RECORD +20 -19
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/LICENSE +0 -0
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/WHEEL +0 -0
- {upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/top_level.txt +0 -0
|
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
+
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
42
|
|
|
42
43
|
# Errors
|
|
43
44
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -111,6 +112,9 @@ x_is_empty=X is empty
|
|
|
111
112
|
y_is_empty=y is empty
|
|
112
113
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
114
|
missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
|
|
115
|
+
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
116
|
+
train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
117
|
+
eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
114
118
|
# eval set validation
|
|
115
119
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
116
120
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
|
|
|
145
149
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
146
150
|
dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
|
|
147
151
|
dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
148
|
-
|
|
152
|
+
dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
153
|
+
dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
149
154
|
dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
150
155
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
151
156
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
196
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
197
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
198
203
|
phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
199
|
-
target_type_detected
|
|
204
|
+
target_type_detected=\nDetected task type: {}\n
|
|
200
205
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
201
206
|
all_ok_community_invite=❓ Support request
|
|
202
|
-
too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
207
|
+
too_small_for_metrics=Your train dataset or one of the eval datasets contains fewer than 500 rows. For such datasets Upgini will not calculate accuracy metrics. Please increase the number of rows in the affected dataset to calculate accuracy metrics
|
|
203
208
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
204
209
|
loss_selection_info=Using loss `{}` for feature selection
|
|
205
210
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
upgini/search_task.py
CHANGED
|
@@ -57,7 +57,7 @@ class SearchTask:
|
|
|
57
57
|
if logger is not None:
|
|
58
58
|
self.logger = logger
|
|
59
59
|
else:
|
|
60
|
-
self.logger = logging.getLogger()
|
|
60
|
+
self.logger = logging.getLogger("muted_logger")
|
|
61
61
|
self.logger.setLevel("FATAL")
|
|
62
62
|
self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
|
|
63
63
|
self.unused_features_for_generation: Optional[List[str]] = None
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
from typing import List, Optional
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
|
|
|
9
9
|
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
|
+
from upgini.metadata import SearchKey
|
|
12
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
14
|
+
from upgini.utils.warning_counter import WarningCounter
|
|
13
15
|
|
|
14
16
|
DATE_FORMATS = [
|
|
15
17
|
"%Y-%m-%d",
|
|
@@ -44,7 +46,7 @@ class DateTimeSearchKeyConverter:
|
|
|
44
46
|
if logger is not None:
|
|
45
47
|
self.logger = logger
|
|
46
48
|
else:
|
|
47
|
-
self.logger = logging.getLogger()
|
|
49
|
+
self.logger = logging.getLogger("muted_logger")
|
|
48
50
|
self.logger.setLevel("FATAL")
|
|
49
51
|
self.generated_features: List[str] = []
|
|
50
52
|
self.bundle = bundle or get_custom_bundle()
|
|
@@ -98,6 +100,9 @@ class DateTimeSearchKeyConverter:
|
|
|
98
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
99
101
|
self.logger.warning(msg)
|
|
100
102
|
raise ValidationError(msg)
|
|
103
|
+
else:
|
|
104
|
+
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
105
|
+
df[self.date_column] = self.parse_date(df)
|
|
101
106
|
|
|
102
107
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
103
108
|
# as additional features
|
|
@@ -225,3 +230,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
225
230
|
|
|
226
231
|
is_diff_less_than_two_columns = grouped.apply(check_differences)
|
|
227
232
|
return is_diff_less_than_two_columns.all()
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def validate_dates_distribution(
    X: pd.DataFrame,
    search_keys: Dict[str, SearchKey],
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
):
    """Warn when the number of rows per date differs between the two halves of the date range.

    The date column is the one declared in ``search_keys`` as ``SearchKey.DATE`` or
    ``SearchKey.DATETIME`` (the last such key wins). If none is declared, the first
    non-key, non-numeric column of ``X`` that ``pd.to_datetime`` can parse is used.

    Distinct dates are sorted and split in two halves; when the ratio of the mean
    row count per date (second half / first half) is outside ``[0.8, 1.2]`` the
    ``x_unstable_by_date`` message is printed and logged, and ``warning_counter``
    (if given) is incremented.

    This is an advisory check: if no date column can be found or parsed, the
    function returns without raising.

    Args:
        X: Training features.
        search_keys: Mapping of column name to search key type.
        logger: Logger to use; a muted ``"muted_logger"`` is used when omitted.
        bundle: Resource bundle with messages; ``get_custom_bundle()`` when omitted.
        warning_counter: Optional counter incremented when the warning is emitted.
    """
    if logger is None:
        logger = logging.getLogger("muted_logger")
        logger.setLevel("FATAL")

    def _try_parse_dates(column):
        # Best-effort conversion of a column to calendar dates; None when it is not date-like
        try:
            return pd.to_datetime(X[column]).dt.date
        except Exception:
            return None

    maybe_date_col = None
    for key, key_type in search_keys.items():
        if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
            maybe_date_col = key

    dates = None
    if maybe_date_col is not None:
        dates = _try_parse_dates(maybe_date_col)
        if dates is None:
            # Do not fail the whole run because of an advisory check
            logger.warning(f"Failed to parse date column {maybe_date_col} for dates distribution validation")
            return
    else:
        for col in X.columns:
            # pd.to_datetime coerces any numeric column (as epoch nanoseconds), so without this
            # filter the probe would stop on the first numeric feature and never reach a real date column
            if col in search_keys or is_numeric_dtype(X[col]):
                continue
            candidate = _try_parse_dates(col)
            if candidate is not None:
                maybe_date_col = col
                dates = candidate
                break

    if dates is None:
        return

    date_counts = dates.value_counts().sort_index()
    if len(date_counts) < 2:
        # A single date (or none) cannot be split in two halves to compare
        return

    half = round(len(date_counts) / 2)
    first_half_mean = date_counts[:half].mean()
    second_half_mean = date_counts[half:].mean()
    ratio = second_half_mean / first_half_mean

    if ratio > 1.2 or ratio < 0.8:
        if warning_counter is not None:
            warning_counter.increment()
        bundle = bundle or get_custom_bundle()
        msg = bundle.get("x_unstable_by_date")
        print(msg)
        logger.warning(msg)
|
|
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
6
|
+
from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
7
7
|
from upgini.resource_bundle import ResourceBundle
|
|
8
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
|
|
|
78
78
|
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
79
79
|
if len(rows_with_diff_target) > 0:
|
|
80
80
|
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
logger
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
81
|
+
if EVAL_SET_INDEX not in df.columns:
|
|
82
|
+
rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
83
|
+
rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
|
|
84
|
+
perc = len(rows_to_remove) * 100 / len(df)
|
|
85
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
86
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
87
|
+
)
|
|
88
|
+
if not silent:
|
|
89
|
+
print(msg)
|
|
90
|
+
if logger:
|
|
91
|
+
logger.warning(msg)
|
|
92
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
93
|
+
df = df[~df.index.isin(rows_to_remove.index)]
|
|
94
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
|
+
else:
|
|
96
|
+
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
97
|
+
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
98
|
+
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
99
|
+
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
100
|
+
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
101
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
102
|
+
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
103
|
+
)
|
|
104
|
+
if not silent:
|
|
105
|
+
print(msg)
|
|
106
|
+
if logger:
|
|
107
|
+
logger.warning(msg)
|
|
108
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
|
|
109
|
+
train = train[~train.index.isin(train_rows_to_remove.index)]
|
|
110
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
|
|
111
|
+
|
|
112
|
+
evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
|
|
113
|
+
new_evals = []
|
|
114
|
+
for i, eval in enumerate(evals):
|
|
115
|
+
eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
116
|
+
eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
|
|
117
|
+
eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
|
|
118
|
+
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
119
|
+
eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
|
|
120
|
+
)
|
|
121
|
+
if not silent:
|
|
122
|
+
print(msg)
|
|
123
|
+
if logger:
|
|
124
|
+
logger.warning(msg)
|
|
125
|
+
logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
|
|
126
|
+
eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
|
|
127
|
+
logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
|
|
128
|
+
new_evals.append(eval)
|
|
129
|
+
|
|
130
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
131
|
+
df = pd.concat([train] + new_evals)
|
|
132
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
133
|
return df
|
|
96
134
|
|
|
97
135
|
|
|
@@ -101,14 +139,18 @@ def clean_full_duplicates(
|
|
|
101
139
|
nrows = len(df)
|
|
102
140
|
if nrows == 0:
|
|
103
141
|
return df
|
|
104
|
-
# Remove
|
|
142
|
+
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
105
143
|
unique_columns = df.columns.tolist()
|
|
106
144
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
107
145
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
108
146
|
if SORT_ID in unique_columns:
|
|
109
147
|
unique_columns.remove(SORT_ID)
|
|
148
|
+
if EVAL_SET_INDEX in unique_columns:
|
|
149
|
+
unique_columns.remove(EVAL_SET_INDEX)
|
|
110
150
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
111
|
-
|
|
151
|
+
# Train segment goes first so if duplicates are found in train and eval set
|
|
152
|
+
# then we keep unique rows in train segment
|
|
153
|
+
df = df.drop_duplicates(subset=unique_columns, keep="first")
|
|
112
154
|
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
113
155
|
nrows_after_full_dedup = len(df)
|
|
114
156
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
@@ -123,7 +165,7 @@ def clean_full_duplicates(
|
|
|
123
165
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
124
166
|
if marked_duplicates.sum() > 0:
|
|
125
167
|
dups_indices = df[marked_duplicates].index.to_list()
|
|
126
|
-
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
|
|
168
|
+
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
|
127
169
|
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
|
128
170
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
129
171
|
|
|
@@ -133,6 +175,7 @@ def clean_full_duplicates(
|
|
|
133
175
|
print(msg)
|
|
134
176
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
135
177
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
178
|
+
|
|
136
179
|
return df
|
|
137
180
|
|
|
138
181
|
|
upgini/utils/sklearn_ext.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import functools
|
|
2
|
-
import logging
|
|
3
2
|
import numbers
|
|
4
3
|
import time
|
|
5
4
|
import warnings
|
|
@@ -313,7 +312,7 @@ def cross_validate(
|
|
|
313
312
|
|
|
314
313
|
return ret
|
|
315
314
|
except Exception:
|
|
316
|
-
logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
315
|
+
# logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
317
316
|
raise
|
|
318
317
|
# fit_params["use_best_model"] = False
|
|
319
318
|
# return original_cross_validate(
|
upgini/utils/target_utils.py
CHANGED
|
@@ -6,8 +6,10 @@ import pandas as pd
|
|
|
6
6
|
from pandas.api.types import is_numeric_dtype
|
|
7
7
|
|
|
8
8
|
from upgini.errors import ValidationError
|
|
9
|
-
from upgini.metadata import ModelTaskType
|
|
10
|
-
from upgini.resource_bundle import bundle
|
|
9
|
+
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
10
|
+
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
|
+
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
12
|
+
from upgini.utils.warning_counter import WarningCounter
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
|
|
@@ -72,3 +74,124 @@ def is_int_encoding(unique_values):
|
|
|
72
74
|
return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
|
|
73
75
|
range(1, len(unique_values) + 1)
|
|
74
76
|
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def balance_undersample(
    df: pd.DataFrame,
    target_column: str,
    task_type: ModelTaskType,
    random_state: int,
    imbalance_threshold: float = 0.2,
    min_sample_threshold: int = 5000,
    binary_bootstrap_loops: int = 5,
    multiclass_bootstrap_loops: int = 2,
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
) -> pd.DataFrame:
    """Undersample over-represented target classes and return the reduced frame.

    Three strategies are applied, in this order of precedence:

    * ``ModelTaskType.MULTICLASS``: classes are ordered by row count; the class at
      position ``int(0.75 * n_classes) - 1`` is the reference. If the largest class
      has more than ``multiclass_bootstrap_loops`` times the reference count, every
      class before the reference is cut to at most that many rows.
    * Otherwise, if the frame has more than ``min_sample_threshold`` rows and the
      rarest class has fewer than ``min_sample_threshold / 2`` rows: all rows of the
      rarest class are kept and the rest is randomly sampled so that the total is
      at most ``min_sample_threshold``.
    * Otherwise, if the largest class has more than ``binary_bootstrap_loops`` times
      the rows of the rarest class: the largest class is cut to
      ``binary_bootstrap_loops * min_class_count`` rows.

    When none applies, ``df`` is returned unchanged. Each applied strategy prints and
    logs a warning and increments ``warning_counter`` if one is given.

    Args:
        df: Data to balance; must contain ``SYSTEM_RECORD_ID`` and ``target_column``.
        target_column: Name of the target column.
        task_type: Task type; only ``MULTICLASS`` is treated specially.
        random_state: Seed for the samplers.
        imbalance_threshold: Fraction used only to compute the numbers reported in
            the ``dataset_rarest_class_less_threshold`` message.
        min_sample_threshold: Row-count threshold of the second strategy.
        binary_bootstrap_loops: Max majority/minority ratio in the third strategy.
        multiclass_bootstrap_loops: Max ratio to the reference class in multiclass.
        logger: Logger to use; a muted ``"muted_logger"`` is used when omitted.
        bundle: Resource bundle with messages; ``get_custom_bundle()`` when omitted.
        warning_counter: Optional counter incremented on each emitted warning.

    Returns:
        The original ``df`` when no resampling happened, otherwise a subset of ``df``
        sorted by ``SYSTEM_RECORD_ID``.

    Raises:
        Exception: If ``SYSTEM_RECORD_ID`` is not a column of ``df``.
    """
    if logger is None:
        logger = logging.getLogger("muted_logger")
        logger.setLevel("FATAL")
    bundle = bundle or get_custom_bundle()
    if SYSTEM_RECORD_ID not in df.columns:
        raise Exception("System record id must be presented for undersampling")

    def _warn(msg: str) -> None:
        # Report an applied resampling strategy through every channel
        logger.warning(msg)
        print(msg)
        if warning_counter:
            warning_counter.increment()

    count = len(df)
    # Class statistics are taken in the original row order so tie-breaking between
    # equally sized classes does not depend on the sort below
    target = df[target_column].copy()
    target_classes_count = target.nunique()

    vc = target.value_counts()
    if vc.empty:
        # Nothing to balance: no rows or no non-null target values
        logger.info(f"Shape after rebalance resampling: {df.shape}")
        return df

    max_class_value = vc.index[0]
    min_class_value = vc.index[len(vc) - 1]
    max_class_count = vc[max_class_value]
    min_class_count = vc[min_class_value]

    min_class_percent = imbalance_threshold / target_classes_count
    min_class_threshold = min_class_percent * count

    resampled_data = df
    df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
    # Labels aligned row-by-row with the sorted frame. Presumably the sampler pairs X and y
    # by position (as imbalanced-learn does), so the pre-sort `target` would mismatch
    # ids and classes whenever the input is not already sorted - TODO confirm
    sorted_target = df[target_column]

    if task_type == ModelTaskType.MULTICLASS:
        # Sort classes by rows count and find 25% quantile class
        classes = vc.index
        quantile25_idx = int(0.75 * len(classes)) - 1
        quantile25_class = classes[quantile25_idx]
        quantile25_class_cnt = vc[quantile25_class]

        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
            _warn(bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt))

            # 25% and lower classes will stay as is. Higher classes will be downsampled
            class_limit = quantile25_class_cnt * multiclass_bootstrap_loops
            sample_strategy = {
                classes[class_idx]: min(vc[classes[class_idx]], class_limit)
                for class_idx in range(quantile25_idx)
            }
            sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
            X = df[SYSTEM_RECORD_ID].to_frame(SYSTEM_RECORD_ID)
            new_x, _ = sampler.fit_resample(X, sorted_target)  # type: ignore

            resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
        _warn(
            bundle.get("dataset_rarest_class_less_threshold").format(
                min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
            )
        )

        # Keep the whole rarest class and fill up to min_sample_threshold with the other rows
        minority_class = df[df[target_column] == min_class_value]
        majority_class = df[df[target_column] != min_class_value]
        sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
        sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
        resampled_data = df[
            (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
        ]

    elif max_class_count > min_class_count * binary_bootstrap_loops:
        _warn(
            bundle.get("dataset_rarest_class_less_threshold").format(
                min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
            )
        )

        sampler = RandomUnderSampler(
            sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
        )
        X = df[SYSTEM_RECORD_ID].to_frame(SYSTEM_RECORD_ID)
        new_x, _ = sampler.fit_resample(X, sorted_target)  # type: ignore

        resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]

    # Log only the shape: interpolating the frame itself would dump its rows into the log
    logger.info(f"Shape after rebalance resampling: {resampled_data.shape}")
    return resampled_data
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
    """Return the Population Stability Index between two target samples.

    Both samples are bucketed into the same two equal-width bins spanning the
    combined min..max range, and the PSI is
    ``sum((p_expected - p_actual) * ln(p_expected / p_actual))`` over the bins.

    Args:
        expected: Reference sample (e.g. the training target).
        actual: Sample compared against the reference.

    Returns:
        The PSI as a plain ``float``. ``0.0`` when all values are identical (the
        distributions cannot differ), ``nan`` when either sample has no non-null
        values (PSI is undefined).
    """
    if expected.dropna().empty or actual.dropna().empty:
        return float("nan")

    combined = pd.concat([expected, actual])
    lower = combined.min()
    upper = combined.max()
    if not lower < upper:
        # Constant target: bin edges would not be increasing, and there is no shift to measure
        return 0.0

    # Define the bins for the target variable
    bins = [lower, (lower + upper) / 2, upper]

    # Share of each sample per bin, in the same (sorted) bin order
    expected_share = expected.value_counts(bins=bins, normalize=True).sort_index().values
    actual_share = actual.value_counts(bins=bins, normalize=True).sort_index().values

    # An empty bin would give log(0) or a division by zero; floor the shares instead
    # so the result stays finite while still signalling a large shift
    eps = 1e-6
    expected_share = np.clip(expected_share, eps, None)
    actual_share = np.clip(actual_share, eps, None)

    return float(np.sum((expected_share - actual_share) * np.log(expected_share / actual_share)))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.274a4
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil >=2.8.0
|
|
30
30
|
Requires-Dist: requests >=2.8.0
|
|
31
|
-
Requires-Dist: pandas <2.
|
|
31
|
+
Requires-Dist: pandas <2.1.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy >=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn >=1.3.0
|
|
34
34
|
Requires-Dist: pydantic <2.0.0,>=1.8.2
|
|
@@ -1,34 +1,35 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=WDj4DO5lqANBdihEcRmwox4w1kqWVOorlIKY4dbsqrU,175376
|
|
6
|
+
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
6
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
7
8
|
upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
|
|
8
|
-
upgini/metrics.py,sha256=
|
|
9
|
-
upgini/search_task.py,sha256=
|
|
9
|
+
upgini/metrics.py,sha256=U3VJKbKmuWACqI4jTcszXo0WqeXFtV8bWyY9VLBL-rw,29129
|
|
10
|
+
upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
|
|
10
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
11
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
12
13
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
13
14
|
upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
|
|
14
15
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
upgini/autofe/all_operands.py,sha256=
|
|
16
|
+
upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
|
|
16
17
|
upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
|
|
17
|
-
upgini/autofe/date.py,sha256=
|
|
18
|
-
upgini/autofe/feature.py,sha256=
|
|
18
|
+
upgini/autofe/date.py,sha256=_6RoEJZ5Kf-Q_aMOFucS6YSIZpCcelgpw-edV4qmRIM,3935
|
|
19
|
+
upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
|
|
19
20
|
upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
|
|
20
21
|
upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
|
|
21
22
|
upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
|
|
22
23
|
upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
|
|
23
24
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
25
|
+
upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
|
|
25
26
|
upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
|
|
26
27
|
upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
|
|
27
28
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
-
upgini/normalizer/phone_normalizer.py,sha256=
|
|
29
|
+
upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
|
|
29
30
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
30
31
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
31
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
32
|
+
upgini/resource_bundle/strings.properties,sha256=x-2fXtGc5Z2n7eUg9b6I4yhok56TTXDvzwU1JUaKcj4,26285
|
|
32
33
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
33
34
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
35
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
@@ -40,8 +41,8 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
|
|
|
40
41
|
upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
|
|
41
42
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
42
43
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
43
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
44
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
44
|
+
upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
|
|
45
|
+
upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
|
|
45
46
|
upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
|
|
46
47
|
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
47
48
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
@@ -51,12 +52,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
|
|
|
51
52
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
52
53
|
upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
|
|
53
54
|
upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
|
|
54
|
-
upgini/utils/sklearn_ext.py,sha256=
|
|
55
|
-
upgini/utils/target_utils.py,sha256=
|
|
55
|
+
upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
|
|
56
|
+
upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
|
|
56
57
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
57
58
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
58
|
-
upgini-1.1.
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
59
|
+
upgini-1.1.274a4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
60
|
+
upgini-1.1.274a4.dist-info/METADATA,sha256=xng0cJvEGeFT2zSBqLDy-qf9I6ONKxdKtXsFWokPpPs,48158
|
|
61
|
+
upgini-1.1.274a4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
62
|
+
upgini-1.1.274a4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
63
|
+
upgini-1.1.274a4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|