upgini 1.2.11__tar.gz → 1.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.11 → upgini-1.2.13}/PKG-INFO +1 -1
- upgini-1.2.13/src/upgini/__about__.py +1 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/binary.py +4 -2
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/features_enricher.py +1 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/metrics.py +1 -1
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/strings.properties +2 -2
- upgini-1.2.13/src/upgini/utils/deduplicate_utils.py +200 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/features_validator.py +11 -11
- upgini-1.2.11/src/upgini/__about__.py +0 -1
- upgini-1.2.11/src/upgini/utils/deduplicate_utils.py +0 -195
- {upgini-1.2.11 → upgini-1.2.13}/.gitignore +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/LICENSE +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/README.md +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/pyproject.toml +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/ads.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/dataset.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/errors.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/http.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/metadata.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/search_task.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/spinner.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.11 → upgini-1.2.13}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.13"
|
|
@@ -142,9 +142,9 @@ class Distance(PandasOperand):
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
143
|
return pd.Series(
|
|
144
144
|
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
145
|
-
)
|
|
145
|
+
).astype(np.float64)
|
|
146
146
|
|
|
147
|
-
# row-wise dot product
|
|
147
|
+
# row-wise dot product, handling None values
|
|
148
148
|
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
149
149
|
left = left.apply(lambda x: np.array(x))
|
|
150
150
|
right = right.apply(lambda x: np.array(x))
|
|
@@ -152,7 +152,9 @@ class Distance(PandasOperand):
|
|
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
|
153
153
|
return res
|
|
154
154
|
|
|
155
|
+
# Calculate the norm of a vector, handling None values
|
|
155
156
|
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
157
|
+
vector = vector.fillna(np.nan)
|
|
156
158
|
return np.sqrt(self.__dot(vector, vector))
|
|
157
159
|
|
|
158
160
|
|
|
@@ -3322,6 +3322,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3322
3322
|
# index overrites from result_features
|
|
3323
3323
|
original_index_name = df_with_original_index.index.name
|
|
3324
3324
|
df_with_original_index = df_with_original_index.reset_index()
|
|
3325
|
+
# TODO drop system_record_id before merge
|
|
3325
3326
|
result_features = pd.merge(
|
|
3326
3327
|
df_with_original_index,
|
|
3327
3328
|
result_features,
|
|
@@ -526,7 +526,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
526
526
|
emb_name = "__grouped_embeddings"
|
|
527
527
|
df = df.copy()
|
|
528
528
|
df[self.emb_features] = df[self.emb_features].fillna(0.0)
|
|
529
|
-
df[emb_name] = df[self.emb_features].values.tolist()
|
|
529
|
+
df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
|
|
530
530
|
df = df.drop(columns=self.emb_features)
|
|
531
531
|
|
|
532
532
|
return df, [emb_name]
|
|
@@ -22,7 +22,7 @@ slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=
|
|
|
22
22
|
slack_community_alt=Upgini Slack community
|
|
23
23
|
version_warning=\nWARNING: Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
|
|
24
24
|
unregistered_with_personal_keys=\nWARNING: Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
|
|
25
|
-
date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM,
|
|
25
|
+
date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
|
|
26
26
|
date_search_without_time_series=\nWARNING: Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
|
|
27
27
|
metrics_exclude_paid_features=\nWARNING: Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
|
|
28
28
|
metrics_no_important_free_features=\nWARNING: No important free features to calculate metrics
|
|
@@ -96,7 +96,7 @@ invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit da
|
|
|
96
96
|
unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
|
|
97
97
|
invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
|
|
98
98
|
invalid_country=All values of COUNTRY column `{}` are invalid
|
|
99
|
-
invalid_ip=All values of
|
|
99
|
+
invalid_ip=All values of IP column `{}` are invalid
|
|
100
100
|
# X and y validation
|
|
101
101
|
unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
|
|
102
102
|
x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
from logging import Logger
|
|
2
|
+
from typing import Dict, List, Optional, Union
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from upgini.metadata import (
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
+
EVAL_SET_INDEX,
|
|
9
|
+
SORT_ID,
|
|
10
|
+
SYSTEM_RECORD_ID,
|
|
11
|
+
TARGET,
|
|
12
|
+
ModelTaskType,
|
|
13
|
+
SearchKey,
|
|
14
|
+
)
|
|
15
|
+
from upgini.resource_bundle import ResourceBundle
|
|
16
|
+
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
17
|
+
from upgini.utils.target_utils import define_task
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def remove_fintech_duplicates(
|
|
21
|
+
df: pd.DataFrame,
|
|
22
|
+
search_keys: Dict[str, SearchKey],
|
|
23
|
+
date_format: Optional[str] = None,
|
|
24
|
+
logger: Optional[Logger] = None,
|
|
25
|
+
silent=False,
|
|
26
|
+
bundle: ResourceBundle = None,
|
|
27
|
+
) -> pd.DataFrame:
|
|
28
|
+
# Initial checks for target type and date column
|
|
29
|
+
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
30
|
+
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
31
|
+
return df
|
|
32
|
+
|
|
33
|
+
if date_col is None:
|
|
34
|
+
return df
|
|
35
|
+
|
|
36
|
+
personal_cols = []
|
|
37
|
+
phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
|
|
38
|
+
if phone_col:
|
|
39
|
+
personal_cols.append(phone_col)
|
|
40
|
+
email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
|
|
41
|
+
if email_col:
|
|
42
|
+
personal_cols.append(email_col)
|
|
43
|
+
hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
|
|
44
|
+
if hem_col:
|
|
45
|
+
personal_cols.append(hem_col)
|
|
46
|
+
if len(personal_cols) == 0:
|
|
47
|
+
return df
|
|
48
|
+
|
|
49
|
+
# Splitting into train and eval_set parts
|
|
50
|
+
if EVAL_SET_INDEX in df.columns:
|
|
51
|
+
train_df = df[df[EVAL_SET_INDEX] == 0]
|
|
52
|
+
eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
|
|
53
|
+
else:
|
|
54
|
+
train_df = df
|
|
55
|
+
eval_dfs = []
|
|
56
|
+
|
|
57
|
+
def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
|
|
58
|
+
"""Process a subset of the dataset to remove duplicates based on personal keys."""
|
|
59
|
+
# Fast check for duplicates based on personal keys
|
|
60
|
+
if not segment_df[personal_cols].duplicated().any():
|
|
61
|
+
return segment_df
|
|
62
|
+
|
|
63
|
+
sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
|
|
64
|
+
|
|
65
|
+
# Group by personal columns to check for unique dates
|
|
66
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
67
|
+
|
|
68
|
+
# Checking for different dates by the same personal keys
|
|
69
|
+
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
70
|
+
total = len(uniques)
|
|
71
|
+
diff_dates = len(uniques[uniques > 1])
|
|
72
|
+
if diff_dates / total >= 0.6:
|
|
73
|
+
return segment_df
|
|
74
|
+
|
|
75
|
+
# Check for duplicate rows
|
|
76
|
+
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
77
|
+
duplicate_rows = sub_df[duplicates]
|
|
78
|
+
if len(duplicate_rows) == 0:
|
|
79
|
+
return segment_df
|
|
80
|
+
|
|
81
|
+
# Check if there are different target values for the same personal keys
|
|
82
|
+
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
83
|
+
if nonunique_target_groups.sum() == 0:
|
|
84
|
+
return segment_df
|
|
85
|
+
|
|
86
|
+
# Helper function to check if there are different target values within 60 days
|
|
87
|
+
def has_diff_target_within_60_days(rows: pd.DataFrame):
|
|
88
|
+
rows = rows.sort_values(by=date_col)
|
|
89
|
+
return (
|
|
90
|
+
len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
|
|
91
|
+
> 0
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
# Filter rows with different target values within 60 days
|
|
95
|
+
nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
|
|
96
|
+
sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
|
|
97
|
+
|
|
98
|
+
# Convert date columns for further checks
|
|
99
|
+
sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
|
|
100
|
+
sub_df
|
|
101
|
+
)
|
|
102
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
103
|
+
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
104
|
+
|
|
105
|
+
if len(rows_with_diff_target) > 0:
|
|
106
|
+
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
107
|
+
rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
108
|
+
rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
|
|
109
|
+
perc = len(rows_to_remove) * 100 / len(segment_df)
|
|
110
|
+
if eval_index == 0:
|
|
111
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
112
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
113
|
+
)
|
|
114
|
+
else:
|
|
115
|
+
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
116
|
+
perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
|
|
117
|
+
)
|
|
118
|
+
if not silent:
|
|
119
|
+
print(msg)
|
|
120
|
+
if logger:
|
|
121
|
+
logger.warning(msg)
|
|
122
|
+
return segment_df[~segment_df.index.isin(rows_to_remove.index)]
|
|
123
|
+
return segment_df
|
|
124
|
+
|
|
125
|
+
# Process the train part separately
|
|
126
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
|
|
127
|
+
train_df = process_df(train_df)
|
|
128
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
|
|
129
|
+
|
|
130
|
+
# Process each eval_set part separately
|
|
131
|
+
new_eval_dfs = []
|
|
132
|
+
for i, eval_df in enumerate(eval_dfs, 1):
|
|
133
|
+
logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
|
|
134
|
+
cleaned_eval_df = process_df(eval_df, i)
|
|
135
|
+
logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
|
|
136
|
+
new_eval_dfs.append(cleaned_eval_df)
|
|
137
|
+
|
|
138
|
+
# Combine the processed train and eval parts back into one dataset
|
|
139
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
140
|
+
if new_eval_dfs:
|
|
141
|
+
df = pd.concat([train_df] + new_eval_dfs)
|
|
142
|
+
else:
|
|
143
|
+
df = train_df
|
|
144
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
145
|
+
|
|
146
|
+
return df
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def clean_full_duplicates(
|
|
150
|
+
df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
|
|
151
|
+
) -> pd.DataFrame:
|
|
152
|
+
nrows = len(df)
|
|
153
|
+
if nrows == 0:
|
|
154
|
+
return df
|
|
155
|
+
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
156
|
+
unique_columns = df.columns.tolist()
|
|
157
|
+
if SYSTEM_RECORD_ID in unique_columns:
|
|
158
|
+
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
159
|
+
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
160
|
+
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
161
|
+
if SORT_ID in unique_columns:
|
|
162
|
+
unique_columns.remove(SORT_ID)
|
|
163
|
+
if EVAL_SET_INDEX in unique_columns:
|
|
164
|
+
unique_columns.remove(EVAL_SET_INDEX)
|
|
165
|
+
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
166
|
+
# Train segment goes first so if duplicates are found in train and eval set
|
|
167
|
+
# then we keep unique rows in train segment
|
|
168
|
+
df = df.drop_duplicates(subset=unique_columns, keep="first")
|
|
169
|
+
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
170
|
+
nrows_after_full_dedup = len(df)
|
|
171
|
+
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
172
|
+
if share_full_dedup > 0:
|
|
173
|
+
msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
|
|
174
|
+
logger.warning(msg)
|
|
175
|
+
# if not silent_mode:
|
|
176
|
+
# print(msg)
|
|
177
|
+
# self.warning_counter.increment()
|
|
178
|
+
if TARGET in df.columns:
|
|
179
|
+
unique_columns.remove(TARGET)
|
|
180
|
+
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
181
|
+
if marked_duplicates.sum() > 0:
|
|
182
|
+
dups_indices = df[marked_duplicates].index.to_list()
|
|
183
|
+
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
|
184
|
+
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
|
185
|
+
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
186
|
+
|
|
187
|
+
msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
|
|
188
|
+
logger.warning(msg)
|
|
189
|
+
if not silent:
|
|
190
|
+
print(msg)
|
|
191
|
+
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
192
|
+
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
193
|
+
|
|
194
|
+
return df
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|
|
198
|
+
for col, key_type in search_keys.items():
|
|
199
|
+
if (isinstance(keys, list) and key_type in keys) or key_type == keys:
|
|
200
|
+
return col
|
|
@@ -56,25 +56,25 @@ class FeaturesValidator:
|
|
|
56
56
|
# self.logger.warning(msg)
|
|
57
57
|
# warning_counter.increment()
|
|
58
58
|
|
|
59
|
+
columns_renaming = columns_renaming or {}
|
|
60
|
+
|
|
59
61
|
if empty_or_constant_features:
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
display_names = empty_or_constant_features
|
|
64
|
-
msg = bundle.get("empty_or_contant_features").format(display_names)
|
|
62
|
+
msg = bundle.get("empty_or_contant_features").format(
|
|
63
|
+
[columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
64
|
+
)
|
|
65
65
|
print(msg)
|
|
66
66
|
self.logger.warning(msg)
|
|
67
67
|
warning_counter.increment()
|
|
68
68
|
|
|
69
69
|
high_cardinality_features = self.find_high_cardinality(df[features])
|
|
70
70
|
if features_for_generate:
|
|
71
|
-
high_cardinality_features = [
|
|
71
|
+
high_cardinality_features = [
|
|
72
|
+
f for f in high_cardinality_features if columns_renaming.get(f, f) not in features_for_generate
|
|
73
|
+
]
|
|
72
74
|
if high_cardinality_features:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
display_names = empty_or_constant_features
|
|
77
|
-
msg = bundle.get("high_cardinality_features").format(display_names)
|
|
75
|
+
msg = bundle.get("high_cardinality_features").format(
|
|
76
|
+
[columns_renaming.get(f, f) for f in high_cardinality_features]
|
|
77
|
+
)
|
|
78
78
|
print(msg)
|
|
79
79
|
self.logger.warning(msg)
|
|
80
80
|
warning_counter.increment()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.11"
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
from logging import Logger
|
|
2
|
-
from typing import Dict, List, Optional, Union
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
|
|
6
|
-
from upgini.metadata import (
|
|
7
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
-
EVAL_SET_INDEX,
|
|
9
|
-
SORT_ID,
|
|
10
|
-
SYSTEM_RECORD_ID,
|
|
11
|
-
TARGET,
|
|
12
|
-
ModelTaskType,
|
|
13
|
-
SearchKey,
|
|
14
|
-
)
|
|
15
|
-
from upgini.resource_bundle import ResourceBundle
|
|
16
|
-
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
17
|
-
from upgini.utils.target_utils import define_task
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def remove_fintech_duplicates(
|
|
21
|
-
df: pd.DataFrame,
|
|
22
|
-
search_keys: Dict[str, SearchKey],
|
|
23
|
-
date_format: Optional[str] = None,
|
|
24
|
-
logger: Optional[Logger] = None,
|
|
25
|
-
silent=False,
|
|
26
|
-
bundle: ResourceBundle = None,
|
|
27
|
-
) -> pd.DataFrame:
|
|
28
|
-
# Base checks
|
|
29
|
-
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
30
|
-
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
31
|
-
return df
|
|
32
|
-
|
|
33
|
-
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
34
|
-
if date_col is None:
|
|
35
|
-
return df
|
|
36
|
-
|
|
37
|
-
personal_cols = []
|
|
38
|
-
phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
|
|
39
|
-
if phone_col:
|
|
40
|
-
personal_cols.append(phone_col)
|
|
41
|
-
email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
|
|
42
|
-
if email_col:
|
|
43
|
-
personal_cols.append(email_col)
|
|
44
|
-
hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
|
|
45
|
-
if hem_col:
|
|
46
|
-
personal_cols.append(hem_col)
|
|
47
|
-
if len(personal_cols) == 0:
|
|
48
|
-
return df
|
|
49
|
-
|
|
50
|
-
sub_df = df[personal_cols + [date_col, TARGET]]
|
|
51
|
-
|
|
52
|
-
# Fast check for duplicates by personal keys
|
|
53
|
-
if not sub_df[personal_cols].duplicated().any():
|
|
54
|
-
return df
|
|
55
|
-
|
|
56
|
-
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
57
|
-
|
|
58
|
-
# counts of diff dates by set of personal keys
|
|
59
|
-
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
60
|
-
total = len(uniques)
|
|
61
|
-
diff_dates = len(uniques[uniques > 1])
|
|
62
|
-
if diff_dates / total >= 0.6:
|
|
63
|
-
return df
|
|
64
|
-
|
|
65
|
-
# Additional checks
|
|
66
|
-
|
|
67
|
-
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
68
|
-
duplicate_rows = sub_df[duplicates]
|
|
69
|
-
if len(duplicate_rows) == 0:
|
|
70
|
-
return df
|
|
71
|
-
|
|
72
|
-
# if there is no different target values in personal keys duplicate rows
|
|
73
|
-
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
74
|
-
if nonunique_target_groups.sum() == 0:
|
|
75
|
-
return df
|
|
76
|
-
|
|
77
|
-
def has_diff_target_within_60_days(rows):
|
|
78
|
-
rows = rows.sort_values(by=date_col)
|
|
79
|
-
return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
|
|
80
|
-
|
|
81
|
-
nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
|
|
82
|
-
sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
|
|
83
|
-
|
|
84
|
-
sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
|
|
85
|
-
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
86
|
-
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
87
|
-
if len(rows_with_diff_target) > 0:
|
|
88
|
-
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
89
|
-
if EVAL_SET_INDEX not in df.columns:
|
|
90
|
-
rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
91
|
-
rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
|
|
92
|
-
perc = len(rows_to_remove) * 100 / len(df)
|
|
93
|
-
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
94
|
-
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
95
|
-
)
|
|
96
|
-
if not silent:
|
|
97
|
-
print(msg)
|
|
98
|
-
if logger:
|
|
99
|
-
logger.warning(msg)
|
|
100
|
-
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
101
|
-
df = df[~df.index.isin(rows_to_remove.index)]
|
|
102
|
-
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
103
|
-
else:
|
|
104
|
-
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
105
|
-
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
106
|
-
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
107
|
-
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
108
|
-
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
109
|
-
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
110
|
-
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
111
|
-
)
|
|
112
|
-
if not silent:
|
|
113
|
-
print(msg)
|
|
114
|
-
if logger:
|
|
115
|
-
logger.warning(msg)
|
|
116
|
-
logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
|
|
117
|
-
train = train[~train.index.isin(train_rows_to_remove.index)]
|
|
118
|
-
logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
|
|
119
|
-
|
|
120
|
-
evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
|
|
121
|
-
new_evals = []
|
|
122
|
-
for i, eval in enumerate(evals):
|
|
123
|
-
eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
124
|
-
eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
|
|
125
|
-
eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
|
|
126
|
-
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
127
|
-
eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
|
|
128
|
-
)
|
|
129
|
-
if not silent:
|
|
130
|
-
print(msg)
|
|
131
|
-
if logger:
|
|
132
|
-
logger.warning(msg)
|
|
133
|
-
logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
|
|
134
|
-
eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
|
|
135
|
-
logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
|
|
136
|
-
new_evals.append(eval)
|
|
137
|
-
|
|
138
|
-
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
139
|
-
df = pd.concat([train] + new_evals)
|
|
140
|
-
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
141
|
-
return df
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def clean_full_duplicates(
|
|
145
|
-
df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
|
|
146
|
-
) -> pd.DataFrame:
|
|
147
|
-
nrows = len(df)
|
|
148
|
-
if nrows == 0:
|
|
149
|
-
return df
|
|
150
|
-
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
151
|
-
unique_columns = df.columns.tolist()
|
|
152
|
-
if SYSTEM_RECORD_ID in unique_columns:
|
|
153
|
-
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
-
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
-
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
156
|
-
if SORT_ID in unique_columns:
|
|
157
|
-
unique_columns.remove(SORT_ID)
|
|
158
|
-
if EVAL_SET_INDEX in unique_columns:
|
|
159
|
-
unique_columns.remove(EVAL_SET_INDEX)
|
|
160
|
-
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
161
|
-
# Train segment goes first so if duplicates are found in train and eval set
|
|
162
|
-
# then we keep unique rows in train segment
|
|
163
|
-
df = df.drop_duplicates(subset=unique_columns, keep="first")
|
|
164
|
-
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
165
|
-
nrows_after_full_dedup = len(df)
|
|
166
|
-
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
167
|
-
if share_full_dedup > 0:
|
|
168
|
-
msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
|
|
169
|
-
logger.warning(msg)
|
|
170
|
-
# if not silent_mode:
|
|
171
|
-
# print(msg)
|
|
172
|
-
# self.warning_counter.increment()
|
|
173
|
-
if TARGET in df.columns:
|
|
174
|
-
unique_columns.remove(TARGET)
|
|
175
|
-
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
176
|
-
if marked_duplicates.sum() > 0:
|
|
177
|
-
dups_indices = df[marked_duplicates].index.to_list()
|
|
178
|
-
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
|
179
|
-
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
|
180
|
-
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
181
|
-
|
|
182
|
-
msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
|
|
183
|
-
logger.warning(msg)
|
|
184
|
-
if not silent:
|
|
185
|
-
print(msg)
|
|
186
|
-
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
187
|
-
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
188
|
-
|
|
189
|
-
return df
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|
|
193
|
-
for col, key_type in search_keys.items():
|
|
194
|
-
if (isinstance(keys, list) and key_type in keys) or key_type == keys:
|
|
195
|
-
return col
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|