upgini 1.2.12__py3-none-any.whl → 1.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -2
- upgini/features_enricher.py +1 -0
- upgini/metrics.py +1 -1
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/deduplicate_utils.py +93 -88
- {upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/METADATA +1 -1
- {upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/RECORD +10 -10
- {upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/WHEEL +0 -0
- {upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.12"
|
|
1
|
+
__version__ = "1.2.13"
|
upgini/autofe/binary.py
CHANGED
|
@@ -142,9 +142,9 @@ class Distance(PandasOperand):
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
144
|
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
145
|
-
)
|
|
145
|
+
).astype(np.float64)
|
|
146
146
|
|
|
147
|
-
# row-wise dot product
|
|
147
|
+
# row-wise dot product, handling None values
|
|
148
148
|
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
149
149
|
left = left.apply(lambda x: np.array(x))
|
|
150
150
|
right = right.apply(lambda x: np.array(x))
|
|
@@ -152,7 +152,9 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
+
# Calculate the norm of a vector, handling None values
|
|
155
156
|
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
157
|
+
vector = vector.fillna(np.nan)
|
|
156
158
|
return np.sqrt(self.__dot(vector, vector))
|
|
157
159
|
|
|
158
160
|
|
upgini/features_enricher.py
CHANGED
|
@@ -3322,6 +3322,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3322
3322
|
# index overrites from result_features
|
|
3323
3323
|
original_index_name = df_with_original_index.index.name
|
|
3324
3324
|
df_with_original_index = df_with_original_index.reset_index()
|
|
3325
|
+
# TODO drop system_record_id before merge
|
|
3325
3326
|
result_features = pd.merge(
|
|
3326
3327
|
df_with_original_index,
|
|
3327
3328
|
result_features,
|
upgini/metrics.py
CHANGED
|
@@ -526,7 +526,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
526
526
|
emb_name = "__grouped_embeddings"
|
|
527
527
|
df = df.copy()
|
|
528
528
|
df[self.emb_features] = df[self.emb_features].fillna(0.0)
|
|
529
|
-
df[emb_name] = df[self.emb_features].values.tolist()
|
|
529
|
+
df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
|
|
530
530
|
df = df.drop(columns=self.emb_features)
|
|
531
531
|
|
|
532
532
|
return df, [emb_name]
|
|
@@ -96,7 +96,7 @@ invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit da
|
|
|
96
96
|
unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
|
|
97
97
|
invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
|
|
98
98
|
invalid_country=All values of COUNTRY column `{}` are invalid
|
|
99
|
-
invalid_ip=All values of
|
|
99
|
+
invalid_ip=All values of IP column `{}` are invalid
|
|
100
100
|
# X and y validation
|
|
101
101
|
unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
|
|
102
102
|
x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
|
|
@@ -25,12 +25,11 @@ def remove_fintech_duplicates(
|
|
|
25
25
|
silent=False,
|
|
26
26
|
bundle: ResourceBundle = None,
|
|
27
27
|
) -> pd.DataFrame:
|
|
28
|
-
#
|
|
28
|
+
# Initial checks for target type and date column
|
|
29
29
|
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
30
30
|
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
31
31
|
return df
|
|
32
32
|
|
|
33
|
-
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
34
33
|
if date_col is None:
|
|
35
34
|
return df
|
|
36
35
|
|
|
@@ -47,97 +46,103 @@ def remove_fintech_duplicates(
|
|
|
47
46
|
if len(personal_cols) == 0:
|
|
48
47
|
return df
|
|
49
48
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
94
|
-
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
95
|
-
)
|
|
96
|
-
if not silent:
|
|
97
|
-
print(msg)
|
|
98
|
-
if logger:
|
|
99
|
-
logger.warning(msg)
|
|
100
|
-
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
101
|
-
df = df[~df.index.isin(rows_to_remove.index)]
|
|
102
|
-
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
103
|
-
else:
|
|
104
|
-
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
105
|
-
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
106
|
-
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
107
|
-
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
108
|
-
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
109
|
-
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
110
|
-
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
49
|
+
# Splitting into train and eval_set parts
|
|
50
|
+
if EVAL_SET_INDEX in df.columns:
|
|
51
|
+
train_df = df[df[EVAL_SET_INDEX] == 0]
|
|
52
|
+
eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
|
|
53
|
+
else:
|
|
54
|
+
train_df = df
|
|
55
|
+
eval_dfs = []
|
|
56
|
+
|
|
57
|
+
def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
|
|
58
|
+
"""Process a subset of the dataset to remove duplicates based on personal keys."""
|
|
59
|
+
# Fast check for duplicates based on personal keys
|
|
60
|
+
if not segment_df[personal_cols].duplicated().any():
|
|
61
|
+
return segment_df
|
|
62
|
+
|
|
63
|
+
sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
|
|
64
|
+
|
|
65
|
+
# Group by personal columns to check for unique dates
|
|
66
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
67
|
+
|
|
68
|
+
# Checking for different dates by the same personal keys
|
|
69
|
+
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
70
|
+
total = len(uniques)
|
|
71
|
+
diff_dates = len(uniques[uniques > 1])
|
|
72
|
+
if diff_dates / total >= 0.6:
|
|
73
|
+
return segment_df
|
|
74
|
+
|
|
75
|
+
# Check for duplicate rows
|
|
76
|
+
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
77
|
+
duplicate_rows = sub_df[duplicates]
|
|
78
|
+
if len(duplicate_rows) == 0:
|
|
79
|
+
return segment_df
|
|
80
|
+
|
|
81
|
+
# Check if there are different target values for the same personal keys
|
|
82
|
+
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
83
|
+
if nonunique_target_groups.sum() == 0:
|
|
84
|
+
return segment_df
|
|
85
|
+
|
|
86
|
+
# Helper function to check if there are different target values within 60 days
|
|
87
|
+
def has_diff_target_within_60_days(rows: pd.DataFrame):
|
|
88
|
+
rows = rows.sort_values(by=date_col)
|
|
89
|
+
return (
|
|
90
|
+
len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
|
|
91
|
+
> 0
|
|
111
92
|
)
|
|
93
|
+
|
|
94
|
+
# Filter rows with different target values within 60 days
|
|
95
|
+
nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
|
|
96
|
+
sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
|
|
97
|
+
|
|
98
|
+
# Convert date columns for further checks
|
|
99
|
+
sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
|
|
100
|
+
sub_df
|
|
101
|
+
)
|
|
102
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
103
|
+
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
104
|
+
|
|
105
|
+
if len(rows_with_diff_target) > 0:
|
|
106
|
+
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
107
|
+
rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
108
|
+
rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
|
|
109
|
+
perc = len(rows_to_remove) * 100 / len(segment_df)
|
|
110
|
+
if eval_index == 0:
|
|
111
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
112
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
113
|
+
)
|
|
114
|
+
else:
|
|
115
|
+
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
116
|
+
perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
|
|
117
|
+
)
|
|
112
118
|
if not silent:
|
|
113
119
|
print(msg)
|
|
114
120
|
if logger:
|
|
115
121
|
logger.warning(msg)
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
122
|
+
return segment_df[~segment_df.index.isin(rows_to_remove.index)]
|
|
123
|
+
return segment_df
|
|
124
|
+
|
|
125
|
+
# Process the train part separately
|
|
126
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
|
|
127
|
+
train_df = process_df(train_df)
|
|
128
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
|
|
129
|
+
|
|
130
|
+
# Process each eval_set part separately
|
|
131
|
+
new_eval_dfs = []
|
|
132
|
+
for i, eval_df in enumerate(eval_dfs, 1):
|
|
133
|
+
logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
|
|
134
|
+
cleaned_eval_df = process_df(eval_df, i)
|
|
135
|
+
logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
|
|
136
|
+
new_eval_dfs.append(cleaned_eval_df)
|
|
137
|
+
|
|
138
|
+
# Combine the processed train and eval parts back into one dataset
|
|
139
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
140
|
+
if new_eval_dfs:
|
|
141
|
+
df = pd.concat([train_df] + new_eval_dfs)
|
|
142
|
+
else:
|
|
143
|
+
df = train_df
|
|
144
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
145
|
+
|
|
141
146
|
return df
|
|
142
147
|
|
|
143
148
|
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=rQSlPcfj4yT4krIq6epTVQyBzIX4etVOgfupVkM-RnU,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=bgi1rc3vCCeCuwRX1doQSQCzaV5OEiYHv_6XIvapnaw,31254
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
@@ -15,7 +15,7 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
|
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
18
|
+
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
19
|
upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
|
|
20
20
|
upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
|
|
21
21
|
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -43,7 +43,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
|
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
46
|
+
upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.13.dist-info/METADATA,sha256=IRJWMi0M4nUgCqMwp4kffx8QXgR1DJ2VsqH5Y7-nQ2E,48577
|
|
61
|
+
upgini-1.2.13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.13.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|