upgini 1.1.264__py3-none-any.whl → 1.1.265__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini has been flagged as potentially problematic; see the package registry's advisory page for more details.
- upgini/features_enricher.py +22 -8
- upgini/resource_bundle/strings.properties +2 -1
- upgini/utils/deduplicate_utils.py +61 -18
- {upgini-1.1.264.dist-info → upgini-1.1.265.dist-info}/METADATA +1 -1
- {upgini-1.1.264.dist-info → upgini-1.1.265.dist-info}/RECORD +8 -8
- {upgini-1.1.264.dist-info → upgini-1.1.265.dist-info}/LICENSE +0 -0
- {upgini-1.1.264.dist-info → upgini-1.1.265.dist-info}/WHEEL +0 -0
- {upgini-1.1.264.dist-info → upgini-1.1.265.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -1686,6 +1686,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1686
1686
|
df = validated_X.copy()
|
|
1687
1687
|
|
|
1688
1688
|
df[TARGET] = validated_y
|
|
1689
|
+
|
|
1690
|
+
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1691
|
+
|
|
1689
1692
|
num_samples = _num_samples(df)
|
|
1690
1693
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1691
1694
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
@@ -1920,6 +1923,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1920
1923
|
|
|
1921
1924
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1922
1925
|
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1926
|
+
|
|
1923
1927
|
if email_converted_to_hem:
|
|
1924
1928
|
non_keys_columns.append(email_column)
|
|
1925
1929
|
|
|
@@ -1941,6 +1945,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1941
1945
|
if add_fit_system_record_id:
|
|
1942
1946
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1943
1947
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1948
|
+
non_keys_columns.append(SORT_ID)
|
|
1944
1949
|
|
|
1945
1950
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1946
1951
|
|
|
@@ -2883,26 +2888,35 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2883
2888
|
|
|
2884
2889
|
# order by date and idempotent order by other keys
|
|
2885
2890
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2891
|
+
sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
|
|
2886
2892
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2887
2893
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2894
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
2888
2895
|
else:
|
|
2889
2896
|
date_column = self._get_date_column(search_keys)
|
|
2890
2897
|
sort_columns = [date_column] if date_column is not None else []
|
|
2891
2898
|
|
|
2892
|
-
|
|
2899
|
+
other_columns = sorted(
|
|
2893
2900
|
[
|
|
2894
|
-
|
|
2895
|
-
for
|
|
2896
|
-
if
|
|
2897
|
-
and
|
|
2898
|
-
and df[
|
|
2901
|
+
c
|
|
2902
|
+
for c in df.columns
|
|
2903
|
+
if c not in sort_columns
|
|
2904
|
+
and c not in sort_exclude_columns
|
|
2905
|
+
and df[c].nunique() > 1
|
|
2899
2906
|
]
|
|
2907
|
+
# [
|
|
2908
|
+
# sk
|
|
2909
|
+
# for sk, key_type in search_keys.items()
|
|
2910
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
2911
|
+
# and sk in df.columns
|
|
2912
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2913
|
+
# ]
|
|
2900
2914
|
)
|
|
2901
2915
|
|
|
2902
2916
|
search_keys_hash = "search_keys_hash"
|
|
2903
|
-
if len(
|
|
2917
|
+
if len(other_columns) > 0:
|
|
2904
2918
|
sort_columns.append(search_keys_hash)
|
|
2905
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
2919
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
|
|
2906
2920
|
|
|
2907
2921
|
df = df.sort_values(by=sort_columns)
|
|
2908
2922
|
|
|
@@ -146,7 +146,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
|
|
|
146
146
|
dataset_empty_column_names=Some column names are empty. Add names please
|
|
147
147
|
dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
|
|
148
148
|
dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
|
|
149
|
-
|
|
149
|
+
dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
150
|
+
dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
|
|
150
151
|
dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
|
|
151
152
|
dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
|
|
152
153
|
dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
|
|
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
6
|
+
from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
7
7
|
from upgini.resource_bundle import ResourceBundle
|
|
8
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
|
|
|
78
78
|
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
79
79
|
if len(rows_with_diff_target) > 0:
|
|
80
80
|
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
logger
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
81
|
+
if EVAL_SET_INDEX not in df.columns:
|
|
82
|
+
rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
83
|
+
rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
|
|
84
|
+
perc = len(rows_to_remove) * 100 / len(df)
|
|
85
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
86
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
87
|
+
)
|
|
88
|
+
if not silent:
|
|
89
|
+
print(msg)
|
|
90
|
+
if logger:
|
|
91
|
+
logger.warning(msg)
|
|
92
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
93
|
+
df = df[~df.index.isin(rows_to_remove.index)]
|
|
94
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
|
+
else:
|
|
96
|
+
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
97
|
+
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
98
|
+
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
99
|
+
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
100
|
+
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
101
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
102
|
+
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
103
|
+
)
|
|
104
|
+
if not silent:
|
|
105
|
+
print(msg)
|
|
106
|
+
if logger:
|
|
107
|
+
logger.warning(msg)
|
|
108
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
|
|
109
|
+
train = train[~train.index.isin(train_rows_to_remove.index)]
|
|
110
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
|
|
111
|
+
|
|
112
|
+
evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
|
|
113
|
+
new_evals = []
|
|
114
|
+
for i, eval in enumerate(evals):
|
|
115
|
+
eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
116
|
+
eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
|
|
117
|
+
eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
|
|
118
|
+
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
119
|
+
eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
|
|
120
|
+
)
|
|
121
|
+
if not silent:
|
|
122
|
+
print(msg)
|
|
123
|
+
if logger:
|
|
124
|
+
logger.warning(msg)
|
|
125
|
+
logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
|
|
126
|
+
eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
|
|
127
|
+
logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
|
|
128
|
+
new_evals.append(eval)
|
|
129
|
+
|
|
130
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
131
|
+
df = pd.concat([train] + new_evals)
|
|
132
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
133
|
return df
|
|
96
134
|
|
|
97
135
|
|
|
@@ -101,14 +139,18 @@ def clean_full_duplicates(
|
|
|
101
139
|
nrows = len(df)
|
|
102
140
|
if nrows == 0:
|
|
103
141
|
return df
|
|
104
|
-
# Remove
|
|
142
|
+
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
105
143
|
unique_columns = df.columns.tolist()
|
|
106
144
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
107
145
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
108
146
|
if SORT_ID in unique_columns:
|
|
109
147
|
unique_columns.remove(SORT_ID)
|
|
148
|
+
if EVAL_SET_INDEX in unique_columns:
|
|
149
|
+
unique_columns.remove(EVAL_SET_INDEX)
|
|
110
150
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
111
|
-
|
|
151
|
+
# Train segment goes first so if duplicates are found in train and eval set
|
|
152
|
+
# then we keep unique rows in train segment
|
|
153
|
+
df = df.drop_duplicates(subset=unique_columns, keep="first")
|
|
112
154
|
logger.info(f"Dataset shape after clean duplicates: {df.shape}")
|
|
113
155
|
nrows_after_full_dedup = len(df)
|
|
114
156
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
@@ -123,7 +165,7 @@ def clean_full_duplicates(
|
|
|
123
165
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
124
166
|
if marked_duplicates.sum() > 0:
|
|
125
167
|
dups_indices = df[marked_duplicates].index.to_list()
|
|
126
|
-
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
|
|
168
|
+
nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
|
|
127
169
|
num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
|
|
128
170
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
129
171
|
|
|
@@ -133,6 +175,7 @@ def clean_full_duplicates(
|
|
|
133
175
|
print(msg)
|
|
134
176
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
135
177
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
178
|
+
|
|
136
179
|
return df
|
|
137
180
|
|
|
138
181
|
|
|
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
3
|
upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=5rc9vcsCBwmRDb8aAPOFGmkRbC7_zGJGPlaSvkytqCk,172880
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
8
8
|
upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
|
|
@@ -29,7 +29,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
29
29
|
upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
|
|
30
30
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
31
31
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
32
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
32
|
+
upgini/resource_bundle/strings.properties,sha256=_bEfgRl2a9sgoy2RxvIf26NemnCW5CM-1AWWpljwZQE,25664
|
|
33
33
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
34
34
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
@@ -42,7 +42,7 @@ upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU
|
|
|
42
42
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
43
43
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
44
44
|
upgini/utils/datetime_utils.py,sha256=4ii5WphAHlb_NRmdJx35VZpTarJbAr-AnDw3XSzUSow,10346
|
|
45
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
45
|
+
upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
|
|
46
46
|
upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
|
|
47
47
|
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
48
48
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
@@ -56,8 +56,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
|
|
|
56
56
|
upgini/utils/target_utils.py,sha256=5BHcOsBRb4z7P8t3e9rsdXUWUUI7DBmQMmv-x6RwzHM,7152
|
|
57
57
|
upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
|
|
58
58
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
59
|
+
upgini-1.1.265.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
60
|
+
upgini-1.1.265.dist-info/METADATA,sha256=HX-CwFFNgXRRuZ00TELhLI1-3ufrny1K0uZc9p0JWdA,48156
|
|
61
|
+
upgini-1.1.265.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
62
|
+
upgini-1.1.265.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
63
|
+
upgini-1.1.265.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|