upgini 1.1.244a13__py3-none-any.whl → 1.1.244a15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/dataset.py +11 -13
- upgini/features_enricher.py +10 -1
- upgini/metrics.py +4 -1
- upgini/utils/deduplicate_utils.py +65 -10
- {upgini-1.1.244a13.dist-info → upgini-1.1.244a15.dist-info}/METADATA +1 -1
- {upgini-1.1.244a13.dist-info → upgini-1.1.244a15.dist-info}/RECORD +9 -9
- {upgini-1.1.244a13.dist-info → upgini-1.1.244a15.dist-info}/LICENSE +0 -0
- {upgini-1.1.244a13.dist-info → upgini-1.1.244a15.dist-info}/WHEEL +0 -0
- {upgini-1.1.244a13.dist-info → upgini-1.1.244a15.dist-info}/top_level.txt +0 -0
upgini/dataset.py
CHANGED
|
@@ -36,14 +36,12 @@ from upgini.metadata import (
|
|
|
36
36
|
NumericInterval,
|
|
37
37
|
RuntimeParameters,
|
|
38
38
|
SearchCustomization,
|
|
39
|
-
SearchKey,
|
|
40
39
|
)
|
|
41
40
|
from upgini.normalizer.phone_normalizer import PhoneNormalizer
|
|
42
41
|
from upgini.resource_bundle import bundle
|
|
43
42
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
44
43
|
from upgini.search_task import SearchTask
|
|
45
44
|
from upgini.utils import combine_search_keys
|
|
46
|
-
from upgini.utils.deduplicate_utils import remove_fintech_duplicates
|
|
47
45
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
48
46
|
|
|
49
47
|
try:
|
|
@@ -817,19 +815,19 @@ class Dataset: # (pd.DataFrame):
|
|
|
817
815
|
|
|
818
816
|
self.__convert_features_types()
|
|
819
817
|
|
|
820
|
-
search_keys = {
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
}
|
|
818
|
+
# search_keys = {
|
|
819
|
+
# col: SearchKey.from_meaning_type(key_type)
|
|
820
|
+
# for col, key_type in self.meaning_types.items()
|
|
821
|
+
# if SearchKey.from_meaning_type(key_type) is not None
|
|
822
|
+
# }
|
|
825
823
|
|
|
826
|
-
if validate_target:
|
|
827
|
-
|
|
828
|
-
else:
|
|
829
|
-
|
|
824
|
+
# if validate_target:
|
|
825
|
+
# need_full_defuplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
|
|
826
|
+
# else:
|
|
827
|
+
# need_full_defuplication = True
|
|
830
828
|
|
|
831
|
-
if need_full_defuplication:
|
|
832
|
-
|
|
829
|
+
# if need_full_defuplication:
|
|
830
|
+
# self.__clean_duplicates(silent_mode)
|
|
833
831
|
|
|
834
832
|
self.__validate_dataset(validate_target, silent_mode)
|
|
835
833
|
|
upgini/features_enricher.py
CHANGED
|
@@ -65,7 +65,10 @@ from upgini.utils.datetime_utils import (
|
|
|
65
65
|
is_blocked_time_series,
|
|
66
66
|
is_time_series,
|
|
67
67
|
)
|
|
68
|
-
from upgini.utils.deduplicate_utils import
|
|
68
|
+
from upgini.utils.deduplicate_utils import (
|
|
69
|
+
clean_full_duplicates,
|
|
70
|
+
remove_fintech_duplicates,
|
|
71
|
+
)
|
|
69
72
|
from upgini.utils.display_utils import (
|
|
70
73
|
display_html_dataframe,
|
|
71
74
|
do_without_pandas_limits,
|
|
@@ -1850,6 +1853,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1850
1853
|
)
|
|
1851
1854
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1852
1855
|
|
|
1856
|
+
df = clean_full_duplicates(df, self.logger, silent=silent_mode)
|
|
1857
|
+
|
|
1853
1858
|
df = df.reset_index(drop=True)
|
|
1854
1859
|
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
1855
1860
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
@@ -2131,6 +2136,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2131
2136
|
|
|
2132
2137
|
df = self.__add_country_code(df, self.fit_search_keys)
|
|
2133
2138
|
|
|
2139
|
+
need_full_defuplication, df = remove_fintech_duplicates(df, self.fit_search_keys, self.logger)
|
|
2140
|
+
if need_full_defuplication:
|
|
2141
|
+
df = clean_full_duplicates(df, self.logger)
|
|
2142
|
+
|
|
2134
2143
|
date_column = self._get_date_column(self.fit_search_keys)
|
|
2135
2144
|
self.__adjust_cv(df, date_column, model_task_type)
|
|
2136
2145
|
|
upgini/metrics.py
CHANGED
|
@@ -203,6 +203,7 @@ class EstimatorWrapper:
|
|
|
203
203
|
add_params: Optional[Dict[str, Any]] = None,
|
|
204
204
|
groups: Optional[np.ndarray] = None,
|
|
205
205
|
text_features: Optional[List[str]] = None,
|
|
206
|
+
logger: Optional[logging.Logger] = None,
|
|
206
207
|
):
|
|
207
208
|
self.estimator = estimator
|
|
208
209
|
self.scorer = scorer
|
|
@@ -216,6 +217,7 @@ class EstimatorWrapper:
|
|
|
216
217
|
self.cv_estimators = None
|
|
217
218
|
self.groups = groups
|
|
218
219
|
self.text_features = text_features
|
|
220
|
+
self.logger = logger or logging.getLogger()
|
|
219
221
|
|
|
220
222
|
def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
|
|
221
223
|
X, y, _, fit_params = self._prepare_to_fit(X, y)
|
|
@@ -411,13 +413,14 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
411
413
|
emb_pattern = r"(.+)_emb\d+"
|
|
412
414
|
self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
|
|
413
415
|
embedding_features = []
|
|
414
|
-
if len(self.emb_features) >
|
|
416
|
+
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
415
417
|
X, embedding_features = self.group_embeddings(X)
|
|
416
418
|
params["embedding_features"] = embedding_features
|
|
417
419
|
|
|
418
420
|
# Find text features from passed in generate_features
|
|
419
421
|
if self.text_features is not None:
|
|
420
422
|
self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
|
|
423
|
+
|
|
421
424
|
params["text_features"] = self.text_features
|
|
422
425
|
|
|
423
426
|
# Find rest categorical features
|
|
upgini/utils/deduplicate_utils.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Tuple, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import TARGET, ModelTaskType, SearchKey
|
|
6
|
+
from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
7
7
|
from upgini.resource_bundle import bundle
|
|
8
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -35,8 +35,15 @@ def remove_fintech_duplicates(
|
|
|
35
35
|
if len(personal_cols) == 0:
|
|
36
36
|
return need_full_deduplication, df
|
|
37
37
|
|
|
38
|
-
|
|
38
|
+
sub_df = df[personal_cols + [date_col, TARGET]]
|
|
39
39
|
|
|
40
|
+
# Fast check for duplicates by personal keys
|
|
41
|
+
if not sub_df[personal_cols].duplicated().any():
|
|
42
|
+
return need_full_deduplication, df
|
|
43
|
+
|
|
44
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
45
|
+
|
|
46
|
+
# counts of diff dates by set of personal keys
|
|
40
47
|
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
41
48
|
total = len(uniques)
|
|
42
49
|
diff_dates = len(uniques[uniques > 1])
|
|
@@ -47,35 +54,83 @@ def remove_fintech_duplicates(
|
|
|
47
54
|
|
|
48
55
|
need_full_deduplication = False
|
|
49
56
|
|
|
50
|
-
duplicates =
|
|
51
|
-
duplicate_rows =
|
|
57
|
+
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
58
|
+
duplicate_rows = sub_df[duplicates]
|
|
52
59
|
if len(duplicate_rows) == 0:
|
|
53
60
|
return need_full_deduplication, df
|
|
54
61
|
|
|
55
|
-
if
|
|
62
|
+
# if there is no different target values in personal keys duplicate rows
|
|
63
|
+
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
64
|
+
if nonunique_target_groups.sum() == 0:
|
|
56
65
|
return need_full_deduplication, df
|
|
57
66
|
|
|
58
67
|
def has_diff_target_within_60_days(rows):
    """Tell whether a group has a target change close in time to the previous row.

    ``rows`` is one group of records sharing the same personal keys. The group
    is ordered by ``date_col`` (taken from the enclosing scope) and each row is
    compared with the one before it. The result is True when at least one row
    both carries a different ``TARGET`` value than its predecessor and is less
    than ``60 * 24 * 60 * 60 * 1000`` date units after it.

    NOTE(review): the threshold equals 60 days only if ``date_col`` holds
    millisecond timestamps -- presumably the case after the datetime
    conversion applied by the caller; confirm against that converter.
    """
    window = 60 * 24 * 60 * 60 * 1000
    ordered = rows.sort_values(by=date_col)
    # The first row never matches: its diff() is NaN, and NaN < window is False.
    target_changed = ordered[TARGET].ne(ordered[TARGET].shift())
    close_in_time = ordered[date_col].diff() < window
    return len(ordered[target_changed & close_in_time]) > 0
|
|
61
70
|
|
|
62
|
-
|
|
63
|
-
|
|
71
|
+
nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
|
|
72
|
+
sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
|
|
73
|
+
|
|
74
|
+
sub_df = DateTimeSearchKeyConverter(date_col).convert(sub_df)
|
|
75
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
64
76
|
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
65
77
|
if len(rows_with_diff_target) > 0:
|
|
66
|
-
|
|
78
|
+
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
79
|
+
rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
80
|
+
rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
|
|
81
|
+
perc = len(rows_to_remove) * 100 / len(df)
|
|
67
82
|
msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
|
|
68
|
-
perc, len(
|
|
83
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
69
84
|
)
|
|
70
85
|
if not silent:
|
|
71
86
|
print(msg)
|
|
72
87
|
if logger:
|
|
73
88
|
logger.warning(msg)
|
|
74
|
-
|
|
89
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
90
|
+
df = df[~df.index.isin(rows_to_remove.index)]
|
|
91
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
75
92
|
|
|
76
93
|
return need_full_deduplication, df
|
|
77
94
|
|
|
78
95
|
|
|
96
|
+
def clean_full_duplicates(
    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False
) -> pd.DataFrame:
    """Drop fully duplicated rows, then rows that disagree only on the target.

    The cleanup runs in two passes:

    1. Rows identical in every column except ``SYSTEM_RECORD_ID`` are
       collapsed to their first occurrence.
    2. If a ``TARGET`` column is present, every row whose remaining columns
       (everything except ``SYSTEM_RECORD_ID`` and ``TARGET``) occur more than
       once is removed entirely (``keep=False``), since after pass 1 such rows
       can only differ by target value.

    Args:
        df: Frame to clean. It is not modified in place.
        logger: Destination for shape and warning messages. Falls back to the
            root logger when omitted.
        silent: When False, the differing-target warning is also printed to
            stdout. The full-duplicates warning is only ever logged.

    Returns:
        The cleaned frame. An empty input is returned unchanged.
    """
    # Local import: the fallback logger is the only use of the module here.
    import logging

    if logger is None:
        logger = logging.getLogger()

    nrows = len(df)
    if nrows == 0:
        # Callers assign the result back to their frame, so never return None.
        return df

    # Remove absolute duplicates (exclude system_record_id)
    unique_columns = df.columns.tolist()
    if SYSTEM_RECORD_ID in unique_columns:
        unique_columns.remove(SYSTEM_RECORD_ID)
    logger.info(f"Dataset shape before clean duplicates: {df.shape}")
    df = df.drop_duplicates(subset=unique_columns)
    logger.info(f"Dataset shape after clean duplicates: {df.shape}")
    nrows_after_full_dedup = len(df)
    share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
    if share_full_dedup > 0:
        msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
        # Logged only: this message is not printed even when silent is False.
        logger.warning(msg)

    if TARGET in df.columns:
        unique_columns.remove(TARGET)
        # keep=False flags every member of a duplicated group, not just repeats.
        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
        if marked_duplicates.sum() > 0:
            dups_indices = df[marked_duplicates].index.to_list()
            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup

            msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
            logger.warning(msg)
            if not silent:
                print(msg)
            df = df.drop_duplicates(subset=unique_columns, keep=False)
            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")

    return df
|
|
132
|
+
|
|
133
|
+
|
|
79
134
|
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|
|
80
135
|
for col, key_type in search_keys.items():
|
|
81
136
|
if (isinstance(keys, list) and key_type in keys) or key_type == keys:
|
|
{upgini-1.1.244a13.dist-info → upgini-1.1.244a15.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=WGpnmpnmfdyB2DAwaj7mkk2s0e-6Z6bg5BWj1lUE2p0,49960
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=a3RBqMMxY3lH6bkvc20I3zyL5oQF3VUDUIvwqgtzxxA,165592
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
|
|
8
8
|
upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
|
|
9
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metrics.py,sha256=BCEotBr4_PCfUheswZ_FPAj6Lk_P-iyl9Qfi8WqdbqY,26136
|
|
10
10
|
upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
|
|
11
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
12
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
@@ -40,7 +40,7 @@ upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU
|
|
|
40
40
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
41
41
|
upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
|
|
42
42
|
upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
|
|
43
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
43
|
+
upgini/utils/deduplicate_utils.py,sha256=OxJ3ygvRQL5H_h2Kn0mwRaj5Ux8FmCQ8ZV4YQvSRZyw,5794
|
|
44
44
|
upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
|
|
45
45
|
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
46
46
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
|
|
|
54
54
|
upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
|
|
55
55
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
56
56
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
57
|
-
upgini-1.1.
|
|
58
|
-
upgini-1.1.
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
57
|
+
upgini-1.1.244a15.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
58
|
+
upgini-1.1.244a15.dist-info/METADATA,sha256=GslYuWCFvWkrO6G5g88d5yIzv5nqe4OdOqyVHHEKO0k,48265
|
|
59
|
+
upgini-1.1.244a15.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
60
|
+
upgini-1.1.244a15.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
61
|
+
upgini-1.1.244a15.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|