upgini 1.1.252a5__py3-none-any.whl → 1.1.253__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini has been flagged as potentially problematic by the registry.
- upgini/data_source/data_source_publisher.py +1 -1
- upgini/dataset.py +63 -67
- upgini/features_enricher.py +106 -47
- upgini/search_task.py +1 -1
- upgini/utils/__init__.py +14 -0
- upgini/utils/datetime_utils.py +1 -1
- upgini/utils/deduplicate_utils.py +13 -17
- upgini/utils/features_validator.py +9 -1
- upgini/utils/target_utils.py +2 -111
- {upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/METADATA +1 -1
- {upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/RECORD +14 -14
- {upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/LICENSE +0 -0
- {upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/WHEEL +0 -0
- {upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/top_level.txt +0 -0
upgini/dataset.py
CHANGED

@@ -15,9 +15,9 @@ from pandas.api.types import (
     is_float_dtype,
     is_integer_dtype,
     is_numeric_dtype,
+    is_period_dtype,
     is_string_dtype,
 )
-from pandas.core.dtypes.common import is_period_dtype

 from upgini.errors import ValidationError
 from upgini.http import ProgressStage, SearchProgress, _RestClient
@@ -39,10 +39,10 @@ from upgini.metadata import (
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
-from upgini.utils import combine_search_keys
+from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
 from upgini.utils.email_utils import EmailSearchKeyConverter
-from upgini.utils.target_utils import balance_undersample

 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -61,8 +61,6 @@ class Dataset: # (pd.DataFrame):
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
     MIN_SAMPLE_THRESHOLD = 5_000
     IMBALANCE_THESHOLD = 0.4
-    BINARY_BOOTSTRAP_LOOPS = 5
-    MULTICLASS_BOOTSTRAP_LOOPS = 2
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
@@ -224,45 +222,6 @@ class Dataset: # (pd.DataFrame):
             if max_length > self.MAX_STRING_FEATURE_LENGTH:
                 self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)

-    def __clean_duplicates(self, silent_mode: bool = False):
-        """Clean DataSet from full duplicates."""
-        # self.logger.info("Clean full duplicates")
-        nrows = len(self.data)
-        if nrows == 0:
-            return
-        # Remove absolute duplicates (exclude system_record_id)
-        unique_columns = self.data.columns.tolist()
-        unique_columns.remove(SYSTEM_RECORD_ID)
-        self.logger.info(f"Dataset shape before clean duplicates: {self.data.shape}")
-        self.data.drop_duplicates(subset=unique_columns, inplace=True)
-        self.logger.info(f"Dataset shape after clean duplicates: {self.data.shape}")
-        nrows_after_full_dedup = len(self.data)
-        share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
-        if share_full_dedup > 0:
-            msg = self.bundle.get("dataset_full_duplicates").format(share_full_dedup)
-            self.logger.warning(msg)
-            # if not silent_mode:
-            #     print(msg)
-            #     self.warning_counter.increment()
-        target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value)
-        if target_column is not None:
-            unique_columns.remove(target_column)
-            marked_duplicates = self.data.duplicated(subset=unique_columns, keep=False)
-            if marked_duplicates.sum() > 0:
-                dups_indices = self.data[marked_duplicates].index.to_list()
-                nrows_after_tgt_dedup = len(self.data.drop_duplicates(subset=unique_columns))
-                num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
-                share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
-
-                msg = self.bundle.get("dataset_diff_target_duplicates").format(
-                    share_tgt_dedup, num_dup_rows, dups_indices
-                )
-                self.logger.warning(msg)
-                if not silent_mode:
-                    print(msg)
-                self.data.drop_duplicates(subset=unique_columns, keep=False, inplace=True)
-                self.logger.info(f"Dataset shape after clean invalid target duplicates: {self.data.shape}")
-
     def __convert_bools(self):
         """Convert bool columns True -> 1, False -> 0"""
         # self.logger.info("Converting bool to int")
@@ -280,12 +239,10 @@ class Dataset: # (pd.DataFrame):
     def __correct_decimal_comma(self):
         """Check DataSet for decimal commas and fix them"""
         # self.logger.info("Correct decimal commas")
-
-
-
-
-        for col in cls_to_check:
-            if tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any():
+        columns_to_fix = find_numbers_with_decimal_comma(self.data)
+        if len(columns_to_fix) > 0:
+            self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
+            for col in columns_to_fix:
                 self.data[col] = self.data[col].astype("string").str.replace(",", ".").astype(np.float64)

     @staticmethod
@@ -504,8 +461,10 @@ class Dataset: # (pd.DataFrame):
             self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
         ):
             count = len(train_segment)
-
-
+            min_class_count = count
+            min_class_value = None
+            target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
+            target = train_segment[target_column].copy()
             target_classes_count = target.nunique()

             if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
@@ -515,9 +474,12 @@
                 self.logger.warning(msg)
                 raise ValidationError(msg)

-
-
-
+            unique_target = target.unique()
+            for v in list(unique_target):  # type: ignore
+                current_class_count = len(train_segment.loc[target == v])
+                if current_class_count < min_class_count:
+                    min_class_count = current_class_count
+                    min_class_value = v

             if min_class_count < self.MIN_TARGET_CLASS_ROWS:
                 msg = self.bundle.get("dataset_rarest_class_less_min").format(
@@ -530,19 +492,53 @@
             min_class_threshold = min_class_percent * count

             if min_class_count < min_class_threshold:
-
-
-                    df=train_segment,
-                    target_column=target_column,
-                    task_type=self.task_type,
-                    random_state=self.random_state,
-                    imbalance_threshold=self.IMBALANCE_THESHOLD,
-                    binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
-                    multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_counter=self.warning_counter,
+                msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
+                    min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
                 )
+                self.logger.warning(msg)
+                print(msg)
+                self.warning_counter.increment()
+
+                train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
+                if self.task_type == ModelTaskType.MULTICLASS:
+                    # Sort classes by rows count and find 25% quantile class
+                    classes = target.value_counts().index
+                    quantile25_idx = int(0.75 * len(classes))
+                    quantile25_class = classes[quantile25_idx]
+                    count_of_quantile25_class = len(target[target == quantile25_class])
+                    msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
+                    self.logger.warning(msg)
+                    print(msg)
+                    # 25% and lower classes will stay as is. Higher classes will be downsampled
+                    parts = []
+                    for class_idx in range(quantile25_idx):
+                        sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
+                            n=count_of_quantile25_class, random_state=self.random_state
+                        )
+                        parts.append(sampled)
+                    for class_idx in range(quantile25_idx, len(classes)):
+                        parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
+                    resampled_data = pd.concat(parts)
+                elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
+                    minority_class = train_segment[train_segment[target_column] == min_class_value]
+                    majority_class = train_segment[train_segment[target_column] != min_class_value]
+                    sampled_majority_class = majority_class.sample(
+                        n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
+                    )
+                    resampled_data = train_segment[
+                        (train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
+                        | (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
+                    ]
+                else:
+                    sampler = RandomUnderSampler(random_state=self.random_state)
+                    X = train_segment[SYSTEM_RECORD_ID]
+                    X = X.to_frame(SYSTEM_RECORD_ID)
+                    new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+                    resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+
+                self.data = resampled_data
+                self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
+                self.imbalanced = True

             # Resample over fit threshold
             if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
upgini/features_enricher.py
CHANGED

@@ -16,7 +16,13 @@ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union

 import numpy as np
 import pandas as pd
-from pandas.api.types import
+from pandas.api.types import (
+    is_bool,
+    is_datetime64_any_dtype,
+    is_numeric_dtype,
+    is_period_dtype,
+    is_string_dtype,
+)
 from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
@@ -54,7 +60,7 @@ from upgini.metrics import EstimatorWrapper, validate_scoring_argument
 from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.search_task import SearchTask
 from upgini.spinner import Spinner
-from upgini.utils import combine_search_keys
+from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
 from upgini.utils.country_utils import CountrySearchKeyDetector
 from upgini.utils.custom_loss_utils import (
     get_additional_params_custom_loss,
@@ -215,7 +221,7 @@ class FeaturesEnricher(TransformerMixin):
         if logs_enabled:
             self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip, client_visitorid)
         else:
-            self.logger = logging.getLogger(
+            self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")

         if len(kwargs) > 0:
@@ -1323,16 +1329,52 @@
         fitting_X = X_sorted[client_features].copy()
         fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()

-        #
-
-        columns_with_high_cardinality =
-
+        # Don't do this because one hot encoded client features will be removed
+        # # Detect and drop high cardinality columns in train
+        # columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
+        # columns_with_high_cardinality = [
+        # c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
+        # ]
+        # if len(columns_with_high_cardinality) > 0:
+        # self.logger.warning(
+        # f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
+        # )
+        # fitting_X = fitting_X.drop(columns=columns_with_high_cardinality, errors="ignore")
+        # fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
+
+        # Detect and drop constant columns
+        constant_columns = FeaturesValidator.find_constant_features(fitting_X)
+        if len(constant_columns) > 0:
+            self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
+            fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
+
+        # Remove datetime features
+        datetime_features = [
+            f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
         ]
-
-
-
-
-
+        if len(datetime_features) > 0:
+            self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
+            fitting_X = fitting_X.drop(columns=datetime_features, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=datetime_features, errors="ignore")
+
+        bool_columns = []
+        for col in fitting_X.columns:
+            if is_bool(fitting_X[col]):
+                bool_columns.append(col)
+                fitting_X[col] = fitting_X[col].astype(str)
+                fitting_enriched_X[col] = fitting_enriched_X[col].astype(str)
+        if len(bool_columns) > 0:
+            self.logger.warning(f"Bool columns {bool_columns} was converted to string for metrics calculation")
+
+        decimal_columns_to_fix = find_numbers_with_decimal_comma(fitting_X)
+        if len(decimal_columns_to_fix) > 0:
+            self.logger.warning(f"Convert strings with decimal comma to float: {decimal_columns_to_fix}")
+            for col in decimal_columns_to_fix:
+                fitting_X[col] = fitting_X[col].astype("string").str.replace(",", ".").astype(np.float64)
+                fitting_enriched_X[col] = (
+                    fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
+                )

         fitting_eval_set_dict = dict()
         for idx, eval_tuple in eval_set_sampled_dict.items():
@@ -1346,11 +1388,31 @@
                 client_features + existing_filtered_enriched_features
             ].copy()

-            # Drop high cardinality
-
-
-
-
+            # # Drop high cardinality features in eval set
+            # if len(columns_with_high_cardinality) > 0:
+            # fitting_eval_X = fitting_eval_X.drop(columns=columns_with_high_cardinality, errors="ignore")
+            # fitting_enriched_eval_X = fitting_enriched_eval_X.drop(
+            # columns=columns_with_high_cardinality, errors="ignore"
+            # )
+            # Drop constant features in eval_set
+            if len(constant_columns) > 0:
+                fitting_eval_X = fitting_eval_X.drop(columns=constant_columns, errors="ignore")
+                fitting_enriched_eval_X = fitting_enriched_eval_X.drop(columns=constant_columns, errors="ignore")
+            # Drop datetime features in eval_set
+            if len(datetime_features) > 0:
+                fitting_eval_X = fitting_eval_X.drop(columns=datetime_features, errors="ignore")
+                fitting_enriched_eval_X = fitting_enriched_eval_X.drop(columns=datetime_features, errors="ignore")
+            # Convert bool to string in eval_set
+            if len(bool_columns) > 0:
+                fitting_eval_X[col] = fitting_eval_X[col].astype(str)
+                fitting_enriched_eval_X[col] = fitting_enriched_eval_X[col].astype(str)
+            # Correct string features with decimal commas
+            if len(decimal_columns_to_fix) > 0:
+                for col in decimal_columns_to_fix:
+                    fitting_eval_X[col] = fitting_eval_X[col].astype("string").str.replace(",", ".").astype(np.float64)
+                    fitting_enriched_eval_X[col] = (
+                        fitting_enriched_eval_X[col].astype("string").str.replace(",", ".").astype(np.float64)
+                    )

             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
@@ -1398,6 +1460,7 @@
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
             return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
+        # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
             return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
@@ -1438,6 +1501,8 @@
                 eval_xy[EVAL_SET_INDEX] = idx + 1
                 df = pd.concat([df, eval_xy])

+        df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
+
         num_samples = _num_samples(df)
         sample_threshold, sample_rows = (
             (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
@@ -1561,14 +1626,7 @@
                 eval_df_with_index[EVAL_SET_INDEX] = idx + 1
                 df = pd.concat([df, eval_df_with_index])

-
-            df,
-            self.search_keys,
-            date_format=self.date_format,
-            logger=self.logger,
-            silent=True,
-            bundle=self.bundle,
-        )
+        df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)

         # downsample if need to eval_set threshold
         num_samples = _num_samples(df)
@@ -1653,9 +1711,7 @@

         self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)

-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys
-        )
+        return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)

     def __mk_sampled_data_tuple(
         self,
@@ -2179,11 +2235,10 @@

         df = self.__add_country_code(df, self.fit_search_keys)

-
+        df = remove_fintech_duplicates(
             df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
         )
-
-        df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
+        df = clean_full_duplicates(df, self.logger, bundle=self.bundle)

         date_column = self._get_date_column(self.fit_search_keys)
         self.__adjust_cv(df, date_column, model_task_type)
@@ -2806,8 +2861,9 @@
         # save original order or rows
         original_index_name = df.index.name
         index_name = df.index.name or DEFAULT_INDEX
-
-        df = df.rename(columns={index_name: ORIGINAL_INDEX})
+        original_order_name = "original_order"
+        df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
+        df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})

         # order by date and idempotent order by other keys
         if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
@@ -2847,7 +2903,7 @@
         # return original order
         df = df.set_index(ORIGINAL_INDEX)
         df.index.name = original_index_name
-
+        df = df.sort_values(by=original_order_name).drop(columns=original_order_name)

         meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
         return df
@@ -2966,6 +3022,7 @@
         return result_train, result_eval_sets

     def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
+        llm_source = "LLM with external data augmentation"
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
         features_meta = self._search_task.get_all_features_metadata_v2()
@@ -2990,6 +3047,20 @@
         def list_or_single(lst: List[str], single: str):
             return lst or ([single] if single else [])

+        def to_anchor(link: str, value: str) -> str:
+            if not value:
+                return ""
+            elif not link:
+                return value
+            elif value == llm_source:
+                return value
+            else:
+                return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
+
+        def make_links(names: List[str], links: List[str]):
+            all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
+            return ",".join(all_links)
+
         features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
@@ -3015,18 +3086,6 @@
             if len(feature_sample) > 30:
                 feature_sample = feature_sample[:30] + "..."

-            def to_anchor(link: str, value: str) -> str:
-                if not value:
-                    return ""
-                elif not link:
-                    return value
-                else:
-                    return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
-
-            def make_links(names: List[str], links: List[str]):
-                all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
-                return ",".join(all_links)
-
             internal_provider = feature_meta.data_provider or "Upgini"
             providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
             provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
@@ -3036,7 +3095,7 @@
                 provider = to_anchor("https://upgini.com", "Upgini")

             internal_source = feature_meta.data_source or (
-
+                llm_source
                 if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
                 else ""
             )
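A side note on the hunk at old line 2806: the code now remembers both the original index and the original row order before sorting, then restores them afterwards. The following is a standalone sketch of the same pattern outside upgini; the DataFrame and the column names "original_index" and "original_order" are illustrative.

import pandas as pd

df = pd.DataFrame({"date": [3, 1, 2]}, index=["r3", "r1", "r2"])

original_index_name = df.index.name                                # None in this toy case
df = df.reset_index().rename(columns={"index": "original_index"})  # keep the original index as a column
df = df.reset_index().rename(columns={"index": "original_order"})  # keep the original row order as a column

df = df.sort_values(by="date")                                     # any reordering work happens here

df = df.set_index("original_index")                                # restore the original index...
df.index.name = original_index_name
df = df.sort_values(by="original_order").drop(columns="original_order")  # ...and the original row order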
upgini/search_task.py
CHANGED

@@ -57,7 +57,7 @@ class SearchTask:
         if logger is not None:
             self.logger = logger
         else:
-            self.logger = logging.getLogger(
+            self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")
         self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
         self.unused_features_for_generation: Optional[List[str]] = None
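The only change here mirrors the ones in features_enricher.py and datetime_utils.py: the logging.getLogger(...) fallback (its argument is cut off in this diff view) becomes a bare logging.getLogger(), so the muted fallback is the root logger. A small illustration of the pattern, not package code; note that setting the level on the root logger is a process-wide side effect.

import logging

logger = None  # nothing supplied by the caller
if logger is not None:
    log = logger
else:
    log = logging.getLogger()  # root logger
    log.setLevel("FATAL")      # "FATAL" is an alias for CRITICAL, so almost everything is suppressed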
upgini/utils/__init__.py
CHANGED

@@ -1,6 +1,9 @@
 import itertools
 from typing import List, Tuple

+import pandas as pd
+from pandas.api.types import is_string_dtype
+

 def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
     combined_search_keys = []
@@ -8,3 +11,14 @@ def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
         for subset in itertools.combinations(search_keys, L):
             combined_search_keys.append(subset)
     return combined_search_keys
+
+
+def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
+    tmp = df.head(10)
+    # all columns with sep="," will have dtype == 'object', i.e string
+    # sep="." will be casted to numeric automatically
+    return [
+        col
+        for col in tmp.columns
+        if is_string_dtype(tmp[col]) and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
+    ]
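The new helper scans only the first ten rows and returns the names of string columns whose values look like numbers written with a decimal comma (despite the pd.DataFrame return annotation, it returns a list of column names). Illustrative usage with toy data, mirroring how dataset.py and features_enricher.py apply the fix:

import numpy as np
import pandas as pd
from upgini.utils import find_numbers_with_decimal_comma

df = pd.DataFrame({
    "price": ["1,5", "2,25", "3,0"],  # decimal-comma strings
    "qty": [1, 2, 3],                 # already numeric, ignored
})

columns_to_fix = find_numbers_with_decimal_comma(df)  # ["price"]
for col in columns_to_fix:
    df[col] = df[col].astype("string").str.replace(",", ".").astype(np.float64)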
upgini/utils/datetime_utils.py
CHANGED

@@ -31,7 +31,7 @@ class DateTimeSearchKeyConverter:
         if logger is not None:
             self.logger = logger
         else:
-            self.logger = logging.getLogger(
+            self.logger = logging.getLogger()
             self.logger.setLevel("FATAL")
         self.generated_features: List[str] = []
         self.bundle = bundle or get_custom_bundle()
upgini/utils/deduplicate_utils.py
CHANGED

@@ -1,9 +1,9 @@
 from logging import Logger
-from typing import Dict, List, Optional,
+from typing import Dict, List, Optional, Union

 import pandas as pd

-from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
+from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
 from upgini.resource_bundle import ResourceBundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
@@ -16,17 +16,15 @@ def remove_fintech_duplicates(
     logger: Optional[Logger] = None,
     silent=False,
     bundle: ResourceBundle = None,
-) ->
+) -> pd.DataFrame:
     # Base checks
-    need_full_deduplication = True
-
     date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
     if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
-        return
+        return df

     date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
     if date_col is None:
-        return
+        return df

     personal_cols = []
     phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
@@ -39,13 +37,13 @@
     if hem_col:
         personal_cols.append(hem_col)
     if len(personal_cols) == 0:
-        return
+        return df

     sub_df = df[personal_cols + [date_col, TARGET]]

     # Fast check for duplicates by personal keys
     if not sub_df[personal_cols].duplicated().any():
-        return
+        return df

     grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)

@@ -54,21 +52,19 @@
     total = len(uniques)
     diff_dates = len(uniques[uniques > 1])
     if diff_dates / total >= 0.6:
-        return
+        return df

     # Additional checks

-    need_full_deduplication = False
-
     duplicates = sub_df.duplicated(personal_cols, keep=False)
     duplicate_rows = sub_df[duplicates]
     if len(duplicate_rows) == 0:
-        return
+        return df

     # if there is no different target values in personal keys duplicate rows
     nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
     if nonunique_target_groups.sum() == 0:
-        return
+        return df

     def has_diff_target_within_60_days(rows):
         rows = rows.sort_values(by=date_col)
@@ -96,7 +92,7 @@ def remove_fintech_duplicates(
     df = df[~df.index.isin(rows_to_remove.index)]
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")

-    return
+    return df


 def clean_full_duplicates(
@@ -109,8 +105,8 @@ def clean_full_duplicates(
     unique_columns = df.columns.tolist()
     if SYSTEM_RECORD_ID in unique_columns:
         unique_columns.remove(SYSTEM_RECORD_ID)
-    if
-        unique_columns.remove(
+    if SORT_ID in unique_columns:
+        unique_columns.remove(SORT_ID)
     logger.info(f"Dataset shape before clean duplicates: {df.shape}")
     df = df.drop_duplicates(subset=unique_columns)
     logger.info(f"Dataset shape after clean duplicates: {df.shape}")
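The key behavioural change in this file: remove_fintech_duplicates and clean_full_duplicates now always return a DataFrame (every early exit returns df instead of None), which is what lets the callers in features_enricher.py chain them. A hedged sketch of the new call contract follows; it assumes df, logger and bundle are already defined in the calling scope, and the search-key mapping is hypothetical.

from upgini.metadata import SearchKey
from upgini.utils.deduplicate_utils import clean_full_duplicates, remove_fintech_duplicates

search_keys = {"phone": SearchKey.PHONE, "application_date": SearchKey.DATE}  # illustrative mapping

df = remove_fintech_duplicates(df, search_keys, date_format="%Y-%m-%d", logger=logger, bundle=bundle)
df = clean_full_duplicates(df, logger, bundle=bundle)  # drops exact duplicates, ignoring SYSTEM_RECORD_ID and SORT_ID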
upgini/utils/features_validator.py
CHANGED

@@ -55,7 +55,7 @@ class FeaturesValidator:
         return empty_or_constant_features

     @staticmethod
-    def find_high_cardinality(df: pd.DataFrame):
+    def find_high_cardinality(df: pd.DataFrame) -> List[str]:
         # Remove high cardinality columns
         row_count = df.shape[0]
         return [
@@ -63,3 +63,11 @@
             for i in df
             if (is_string_dtype(df[i]) or is_integer_dtype(df[i])) and (df[i].nunique() / row_count >= 0.9)
         ]
+
+    @staticmethod
+    def find_constant_features(df: pd.DataFrame) -> List[str]:
+        return [
+            i
+            for i in df
+            if df[i].nunique() == 1
+        ]
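find_constant_features complements the existing find_high_cardinality helper and is what features_enricher.py now calls before metrics calculation. A toy illustration (the data is invented for the example):

import pandas as pd
from upgini.utils.features_validator import FeaturesValidator

df = pd.DataFrame({
    "constant": [1, 1, 1, 1],         # single unique value
    "varying": [1, 2, 2, 1],          # low cardinality, kept
    "id_like": ["a", "b", "c", "d"],  # unique string in every row
})

FeaturesValidator.find_constant_features(df)  # ["constant"]
FeaturesValidator.find_high_cardinality(df)   # ["id_like"]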
upgini/utils/target_utils.py
CHANGED

@@ -6,10 +6,8 @@ import pandas as pd
 from pandas.api.types import is_numeric_dtype

 from upgini.errors import ValidationError
-from upgini.metadata import
-from upgini.resource_bundle import
-from upgini.sampler.random_under_sampler import RandomUnderSampler
-from upgini.utils.warning_counter import WarningCounter
+from upgini.metadata import ModelTaskType
+from upgini.resource_bundle import bundle


 def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
@@ -74,110 +72,3 @@ def is_int_encoding(unique_values):
     return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
         range(1, len(unique_values) + 1)
     )
-
-
-def balance_undersample(
-    df: pd.DataFrame,
-    target_column: str,
-    task_type: ModelTaskType,
-    random_state: int,
-    imbalance_threshold: int = 0.2,
-    min_sample_threshold: int = 5000,
-    binary_bootstrap_loops: int = 5,
-    multiclass_bootstrap_loops: int = 2,
-    logger: Optional[logging.Logger] = None,
-    bundle: Optional[ResourceBundle] = None,
-    warning_counter: Optional[WarningCounter] = None,
-) -> pd.DataFrame:
-    if logger is None:
-        logger = logging.getLogger("muted_logger")
-        logger.setLevel("FATAL")
-    bundle = bundle or get_custom_bundle()
-    if SYSTEM_RECORD_ID not in df.columns:
-        raise Exception("System record id must be presented for undersampling")
-
-    count = len(df)
-    target = df[target_column].copy()
-    target_classes_count = target.nunique()
-
-    vc = target.value_counts()
-    max_class_value = vc.index[0]
-    min_class_value = vc.index[len(vc) - 1]
-    max_class_count = vc[max_class_value]
-    min_class_count = vc[min_class_value]
-
-    min_class_percent = imbalance_threshold / target_classes_count
-    min_class_threshold = min_class_percent * count
-
-    resampled_data = df
-    df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
-    if task_type == ModelTaskType.MULTICLASS:
-        # Sort classes by rows count and find 25% quantile class
-        classes = vc.index
-        quantile25_idx = int(0.75 * len(classes)) - 1
-        quantile25_class = classes[quantile25_idx]
-        quantile25_class_cnt = vc[quantile25_class]
-
-        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
-            msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
-            logger.warning(msg)
-            print(msg)
-            if warning_counter:
-                warning_counter.increment()
-
-            # 25% and lower classes will stay as is. Higher classes will be downsampled
-            sample_strategy = dict()
-            for class_idx in range(quantile25_idx):
-                # compare class count with count_of_quantile25_class * 2
-                class_value = classes[class_idx]
-                class_count = vc[class_value]
-                sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
-            sampler = RandomUnderSampler(
-                sampling_strategy=sample_strategy, random_state=random_state
-            )
-            X = df[SYSTEM_RECORD_ID]
-            X = X.to_frame(SYSTEM_RECORD_ID)
-            new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-
-            resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
-        msg = bundle.get("dataset_rarest_class_less_threshold").format(
-            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-        )
-        logger.warning(msg)
-        print(msg)
-        if warning_counter:
-            warning_counter.increment()
-
-        # fill up to min_sample_threshold by majority class
-        minority_class = df[df[target_column] == min_class_value]
-        majority_class = df[df[target_column] != min_class_value]
-        sample_size = min(len(majority_class, min_sample_threshold - min_class_count))
-        sampled_majority_class = majority_class.sample(
-            n=sample_size, random_state=random_state
-        )
-        resampled_data = df[
-            (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
-            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-        ]
-
-    elif max_class_count > min_class_count * binary_bootstrap_loops:
-        msg = bundle.get("dataset_rarest_class_less_threshold").format(
-            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-        )
-        logger.warning(msg)
-        print(msg)
-        if warning_counter:
-            warning_counter.increment()
-
-        sampler = RandomUnderSampler(
-            sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
-        )
-        X = df[SYSTEM_RECORD_ID]
-        X = X.to_frame(SYSTEM_RECORD_ID)
-        new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-
-        resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-
-    logger.info(f"Shape after rebalance resampling: {resampled_data}")
-    return resampled_data
{upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/RECORD
CHANGED

@@ -1,13 +1,13 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=tLa0aEcT7XwVJz1AawXCIEj3vxsSBi-geKicuYpRIMw,48196
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=dP6Oyhi4erESEGlVFA_j67lqhNqNvbkfM4FGpE_WaTU,171760
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
 upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
 upgini/metrics.py,sha256=LS2MgEKgmn9VEXsKzxv3pBZ-q71mTnpWu6vL8fYgpo4,26727
-upgini/search_task.py,sha256=
+upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -21,7 +21,7 @@ upgini/autofe/operand.py,sha256=Rhy7Ky3we-I1Su1--dS4xdsO3K8neV4rqM_Q4xYE4ug,2779
 upgini/autofe/unary.py,sha256=gyMkrx9bfa3o19zS-4JaRlScHrfeZGBsYe7d_6ePT-0,2853
 upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=
+upgini/data_source/data_source_publisher.py,sha256=ZMNyh1x1S3QkXkA-PTtBQ-sbOiANtNioEQs8VoQ24Lk,15110
 upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
 upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -34,29 +34,29 @@ upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
 upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
 upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
-upgini/utils/__init__.py,sha256=
+upgini/utils/__init__.py,sha256=dQ4-s8-sZ5eOBZ-mH3gEwDHTdI0wI1bUAVgVqUKKPx4,786
 upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCSnLGon_W9TPs,859
 upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
 upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
 upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
-upgini/utils/datetime_utils.py,sha256=
-upgini/utils/deduplicate_utils.py,sha256=
+upgini/utils/datetime_utils.py,sha256=P5no4mFgYpEP6oY524ebTKvKc3TBMJzAYpWdj210_Fw,8699
+upgini/utils/deduplicate_utils.py,sha256=ckJrpU8Ruc_vcwIPTopbUjyJuNiseLHNAbQlLfhUCxo,5888
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
 upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
-upgini/utils/features_validator.py,sha256=
+upgini/utils/features_validator.py,sha256=VexG-9p63ni66Hf9T7dgP4iUAhpXqwo3sgMwBK_eii8,2565
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
 upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
 upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
 upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
-upgini/utils/target_utils.py,sha256=
+upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
+upgini-1.1.253.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.253.dist-info/METADATA,sha256=6FwSFP4xzkd9GTHCyToBORKRQEriGSZKJPs1O1ujbcI,48156
+upgini-1.1.253.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+upgini-1.1.253.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.253.dist-info/RECORD,,
{upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/LICENSE
File without changes

{upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/WHEEL
File without changes

{upgini-1.1.252a5.dist-info → upgini-1.1.253.dist-info}/top_level.txt
File without changes