upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -9
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +83 -41
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +931 -542
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
- upgini-1.2.31a1.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280.dev0.dist-info/RECORD +0 -62
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/utils/datetime_utils.py
CHANGED
|
@@ -6,15 +6,11 @@ from typing import Dict, List, Optional
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
8
|
from dateutil.relativedelta import relativedelta
|
|
9
|
-
from pandas.api.types import
|
|
10
|
-
is_numeric_dtype,
|
|
11
|
-
is_period_dtype,
|
|
12
|
-
)
|
|
9
|
+
from pandas.api.types import is_numeric_dtype
|
|
13
10
|
|
|
14
11
|
from upgini.errors import ValidationError
|
|
15
|
-
from upgini.metadata import SearchKey
|
|
12
|
+
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
16
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
17
|
-
from upgini.utils.warning_counter import WarningCounter
|
|
18
14
|
|
|
19
15
|
DATE_FORMATS = [
|
|
20
16
|
"%Y-%m-%d",
|
|
@@ -31,18 +27,20 @@ DATE_FORMATS = [
|
|
|
31
27
|
"%Y-%m-%dT%H:%M:%S.%f",
|
|
32
28
|
]
|
|
33
29
|
|
|
34
|
-
DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
|
|
30
|
+
DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
|
|
35
31
|
|
|
36
32
|
|
|
37
33
|
class DateTimeSearchKeyConverter:
|
|
38
34
|
DATETIME_COL = "_date_time"
|
|
35
|
+
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
36
|
+
MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
|
|
39
37
|
|
|
40
38
|
def __init__(
|
|
41
39
|
self,
|
|
42
40
|
date_column: str,
|
|
43
41
|
date_format: Optional[str] = None,
|
|
44
42
|
logger: Optional[logging.Logger] = None,
|
|
45
|
-
bundle: ResourceBundle = None,
|
|
43
|
+
bundle: Optional[ResourceBundle] = None,
|
|
46
44
|
):
|
|
47
45
|
self.date_column = date_column
|
|
48
46
|
self.date_format = date_format
|
|
@@ -53,6 +51,7 @@ class DateTimeSearchKeyConverter:
|
|
|
53
51
|
self.logger.setLevel("FATAL")
|
|
54
52
|
self.generated_features: List[str] = []
|
|
55
53
|
self.bundle = bundle or get_custom_bundle()
|
|
54
|
+
self.has_old_dates = False
|
|
56
55
|
|
|
57
56
|
@staticmethod
|
|
58
57
|
def _int_to_opt(i: int) -> Optional[int]:
|
|
@@ -81,8 +80,8 @@ class DateTimeSearchKeyConverter:
|
|
|
81
80
|
df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
|
|
82
81
|
elif isinstance(df[self.date_column].values[0], datetime.date):
|
|
83
82
|
df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
|
|
84
|
-
elif
|
|
85
|
-
df[self.date_column] =
|
|
83
|
+
elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
|
|
84
|
+
df[self.date_column] = df[self.date_column].dt.to_timestamp()
|
|
86
85
|
elif is_numeric_dtype(df[self.date_column]):
|
|
87
86
|
# 315532801 - 2524608001 - seconds
|
|
88
87
|
# 315532801000 - 2524608001000 - milliseconds
|
|
@@ -94,11 +93,10 @@ class DateTimeSearchKeyConverter:
|
|
|
94
93
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
|
|
95
94
|
elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
|
|
96
95
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
|
|
97
|
-
elif df[self.date_column].apply(lambda x: 0 < x < 10
|
|
96
|
+
elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
|
|
98
97
|
df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
|
|
99
98
|
else:
|
|
100
99
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
101
|
-
self.logger.warning(msg)
|
|
102
100
|
raise ValidationError(msg)
|
|
103
101
|
else:
|
|
104
102
|
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
@@ -108,20 +106,66 @@ class DateTimeSearchKeyConverter:
|
|
|
108
106
|
# as additional features
|
|
109
107
|
seconds = "datetime_seconds"
|
|
110
108
|
df[self.date_column] = df[self.date_column].dt.tz_localize(None)
|
|
109
|
+
|
|
110
|
+
df = self.clean_old_dates(df)
|
|
111
|
+
|
|
112
|
+
# Define function to apply sine and cosine transformations
|
|
113
|
+
def add_cyclical_features(df, column, period):
|
|
114
|
+
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
|
+
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
|
+
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
+
if sin_feature not in df.columns:
|
|
118
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
+
self.generated_features.append(sin_feature)
|
|
120
|
+
if cos_feature not in df.columns:
|
|
121
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
+
self.generated_features.append(cos_feature)
|
|
123
|
+
|
|
124
|
+
df["quarter"] = df[self.date_column].dt.quarter
|
|
125
|
+
|
|
126
|
+
# Calculate the start date of the quarter for each timestamp
|
|
127
|
+
df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
|
|
128
|
+
|
|
129
|
+
# Calculate the day in the quarter
|
|
130
|
+
df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
|
|
131
|
+
|
|
132
|
+
# Vectorized calculation of days_in_quarter
|
|
133
|
+
quarter = df["quarter"]
|
|
134
|
+
start = df["quarter_start"]
|
|
135
|
+
year = start.dt.year
|
|
136
|
+
month = start.dt.month
|
|
137
|
+
|
|
138
|
+
quarter_end_year = np.where(quarter == 4, year + 1, year)
|
|
139
|
+
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
140
|
+
|
|
141
|
+
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
142
|
+
end.index = df.index
|
|
143
|
+
|
|
144
|
+
df["days_in_quarter"] = (end - start).dt.days
|
|
145
|
+
|
|
146
|
+
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
|
147
|
+
|
|
148
|
+
df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
|
|
149
|
+
|
|
111
150
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
112
151
|
|
|
113
152
|
seconds_without_na = df[seconds].dropna()
|
|
114
153
|
if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
|
|
115
154
|
self.logger.info("Time found in date search key. Add extra features based on time")
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
155
|
+
|
|
156
|
+
# Extract basic components
|
|
157
|
+
df["second"] = df[self.date_column].dt.second
|
|
158
|
+
df["minute"] = df[self.date_column].dt.minute
|
|
159
|
+
df["hour"] = df[self.date_column].dt.hour
|
|
160
|
+
|
|
161
|
+
# Apply cyclical transformations
|
|
162
|
+
add_cyclical_features(df, "second", 60) # Seconds in a minute
|
|
163
|
+
add_cyclical_features(df, "minute", 60) # Minutes in an hour
|
|
164
|
+
add_cyclical_features(df, "minute", 30) # Minutes in half an hour
|
|
165
|
+
add_cyclical_features(df, "hour", 24) # Hours in a day
|
|
166
|
+
|
|
167
|
+
# Drop intermediate columns if not needed
|
|
168
|
+
df.drop(columns=["second", "minute", "hour"], inplace=True)
|
|
125
169
|
|
|
126
170
|
df.drop(columns=seconds, inplace=True)
|
|
127
171
|
|
|
@@ -147,7 +191,19 @@ class DateTimeSearchKeyConverter:
|
|
|
147
191
|
return pd.to_datetime(df[self.date_column], format=date_format)
|
|
148
192
|
except ValueError:
|
|
149
193
|
pass
|
|
150
|
-
|
|
194
|
+
try:
|
|
195
|
+
return pd.to_datetime(df[self.date_column])
|
|
196
|
+
except ValueError:
|
|
197
|
+
raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
|
|
198
|
+
|
|
199
|
+
def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
200
|
+
condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
|
|
201
|
+
old_subset = df[condition]
|
|
202
|
+
if len(old_subset) > 0:
|
|
203
|
+
self.has_old_dates = True
|
|
204
|
+
df.loc[condition, self.date_column] = None
|
|
205
|
+
self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
|
|
206
|
+
return df
|
|
151
207
|
|
|
152
208
|
|
|
153
209
|
def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
@@ -185,7 +241,10 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
|
|
|
185
241
|
def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
|
|
186
242
|
df = df.copy()
|
|
187
243
|
seconds = "datetime_seconds"
|
|
188
|
-
df[date_col]
|
|
244
|
+
if isinstance(df[date_col].dtype, pd.PeriodDtype):
|
|
245
|
+
df[date_col] = df[date_col].dt.to_timestamp()
|
|
246
|
+
else:
|
|
247
|
+
df[date_col] = pd.to_datetime(df[date_col])
|
|
189
248
|
df[date_col] = df[date_col].dt.tz_localize(None)
|
|
190
249
|
df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
|
|
191
250
|
|
|
@@ -231,24 +290,25 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
231
290
|
return len(accumulated_changing_columns) <= 2
|
|
232
291
|
|
|
233
292
|
|
|
234
|
-
def
|
|
235
|
-
|
|
293
|
+
def is_dates_distribution_valid(
|
|
294
|
+
df: pd.DataFrame,
|
|
236
295
|
search_keys: Dict[str, SearchKey],
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
maybe_date_col = key
|
|
296
|
+
) -> bool:
|
|
297
|
+
maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
298
|
+
|
|
299
|
+
if EVAL_SET_INDEX in df.columns:
|
|
300
|
+
X = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
301
|
+
else:
|
|
302
|
+
X = df
|
|
245
303
|
|
|
246
304
|
if maybe_date_col is None:
|
|
247
305
|
for col in X.columns:
|
|
248
306
|
if col in search_keys:
|
|
249
307
|
continue
|
|
250
308
|
try:
|
|
251
|
-
if
|
|
309
|
+
if isinstance(X[col].dtype, pd.PeriodDtype):
|
|
310
|
+
pass
|
|
311
|
+
elif pd.__version__ >= "2.0.0":
|
|
252
312
|
# Format mixed to avoid massive warnings
|
|
253
313
|
pd.to_datetime(X[col], format="mixed")
|
|
254
314
|
else:
|
|
@@ -261,7 +321,9 @@ def validate_dates_distribution(
|
|
|
261
321
|
if maybe_date_col is None:
|
|
262
322
|
return
|
|
263
323
|
|
|
264
|
-
if
|
|
324
|
+
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
325
|
+
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
326
|
+
elif pd.__version__ >= "2.0.0":
|
|
265
327
|
dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
|
|
266
328
|
else:
|
|
267
329
|
dates = pd.to_datetime(X[maybe_date_col]).dt.date
|
|
@@ -272,13 +334,4 @@ def validate_dates_distribution(
|
|
|
272
334
|
date_counts_2 = date_counts[round(len(date_counts) / 2) :]
|
|
273
335
|
ratio = date_counts_2.mean() / date_counts_1.mean()
|
|
274
336
|
|
|
275
|
-
|
|
276
|
-
if warning_counter is not None:
|
|
277
|
-
warning_counter.increment()
|
|
278
|
-
if logger is None:
|
|
279
|
-
logger = logging.getLogger("muted_logger")
|
|
280
|
-
logger.setLevel("FATAL")
|
|
281
|
-
bundle = bundle or get_custom_bundle()
|
|
282
|
-
msg = bundle.get("x_unstable_by_date")
|
|
283
|
-
print(msg)
|
|
284
|
-
logger.warning(msg)
|
|
337
|
+
return ratio >= 0.8 and ratio <= 1.2
|
|
upgini/utils/deduplicate_utils.py
CHANGED
|
@@ -1,10 +1,19 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from logging import Logger
|
|
2
|
-
from typing import Dict, List, Optional, Union
|
|
3
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
|
|
6
|
-
from upgini.metadata import
|
|
7
|
-
|
|
7
|
+
from upgini.metadata import (
|
|
8
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
9
|
+
EVAL_SET_INDEX,
|
|
10
|
+
SORT_ID,
|
|
11
|
+
SYSTEM_RECORD_ID,
|
|
12
|
+
TARGET,
|
|
13
|
+
ModelTaskType,
|
|
14
|
+
SearchKey,
|
|
15
|
+
)
|
|
16
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
8
17
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
18
|
from upgini.utils.target_utils import define_task
|
|
10
19
|
|
|
@@ -14,17 +23,19 @@ def remove_fintech_duplicates(
|
|
|
14
23
|
search_keys: Dict[str, SearchKey],
|
|
15
24
|
date_format: Optional[str] = None,
|
|
16
25
|
logger: Optional[Logger] = None,
|
|
17
|
-
silent=False,
|
|
18
26
|
bundle: ResourceBundle = None,
|
|
19
|
-
) -> pd.DataFrame:
|
|
20
|
-
#
|
|
27
|
+
) -> Tuple[pd.DataFrame, Optional[List[str]]]:
|
|
28
|
+
# Initial checks for target type and date column
|
|
29
|
+
bundle = bundle or get_custom_bundle()
|
|
30
|
+
if logger is None:
|
|
31
|
+
logger = logging.getLogger()
|
|
32
|
+
logger.setLevel(logging.FATAL)
|
|
21
33
|
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
22
34
|
if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
|
|
23
|
-
return df
|
|
35
|
+
return df, []
|
|
24
36
|
|
|
25
|
-
date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
26
37
|
if date_col is None:
|
|
27
|
-
return df
|
|
38
|
+
return df, []
|
|
28
39
|
|
|
29
40
|
personal_cols = []
|
|
30
41
|
phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
|
|
@@ -37,116 +48,133 @@ def remove_fintech_duplicates(
|
|
|
37
48
|
if hem_col:
|
|
38
49
|
personal_cols.append(hem_col)
|
|
39
50
|
if len(personal_cols) == 0:
|
|
40
|
-
return df
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
if
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
)
|
|
88
|
-
if not silent:
|
|
89
|
-
print(msg)
|
|
90
|
-
if logger:
|
|
91
|
-
logger.warning(msg)
|
|
92
|
-
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
93
|
-
df = df[~df.index.isin(rows_to_remove.index)]
|
|
94
|
-
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
95
|
-
else:
|
|
96
|
-
# Indices in train and eval_set can be the same so we remove rows from them separately
|
|
97
|
-
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
98
|
-
train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
99
|
-
train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
|
|
100
|
-
train_perc = len(train_rows_to_remove) * 100 / len(train)
|
|
101
|
-
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
102
|
-
train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
|
|
51
|
+
return df, []
|
|
52
|
+
|
|
53
|
+
# Splitting into train and eval_set parts
|
|
54
|
+
if EVAL_SET_INDEX in df.columns:
|
|
55
|
+
train_df = df[df[EVAL_SET_INDEX] == 0]
|
|
56
|
+
eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
|
|
57
|
+
else:
|
|
58
|
+
train_df = df
|
|
59
|
+
eval_dfs = []
|
|
60
|
+
|
|
61
|
+
warning_messages = []
|
|
62
|
+
|
|
63
|
+
def process_df(segment_df: pd.DataFrame, eval_index=0) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
64
|
+
"""Process a subset of the dataset to remove duplicates based on personal keys."""
|
|
65
|
+
# Fast check for duplicates based on personal keys
|
|
66
|
+
if not segment_df[personal_cols].duplicated().any():
|
|
67
|
+
return segment_df, None
|
|
68
|
+
|
|
69
|
+
sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
|
|
70
|
+
|
|
71
|
+
# Group by personal columns to check for unique dates
|
|
72
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
73
|
+
|
|
74
|
+
# Checking for different dates by the same personal keys
|
|
75
|
+
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
76
|
+
total = len(uniques)
|
|
77
|
+
diff_dates = len(uniques[uniques > 1])
|
|
78
|
+
if diff_dates / total >= 0.6:
|
|
79
|
+
return segment_df, None
|
|
80
|
+
|
|
81
|
+
# Check for duplicate rows
|
|
82
|
+
duplicates = sub_df.duplicated(personal_cols, keep=False)
|
|
83
|
+
duplicate_rows = sub_df[duplicates]
|
|
84
|
+
if len(duplicate_rows) == 0:
|
|
85
|
+
return segment_df, None
|
|
86
|
+
|
|
87
|
+
# Check if there are different target values for the same personal keys
|
|
88
|
+
nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
|
|
89
|
+
if nonunique_target_groups.sum() == 0:
|
|
90
|
+
return segment_df, None
|
|
91
|
+
|
|
92
|
+
# Helper function to check if there are different target values within 60 days
|
|
93
|
+
def has_diff_target_within_60_days(rows: pd.DataFrame):
|
|
94
|
+
rows = rows.sort_values(by=date_col)
|
|
95
|
+
return (
|
|
96
|
+
len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
|
|
97
|
+
> 0
|
|
103
98
|
)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
99
|
+
|
|
100
|
+
# Filter rows with different target values within 60 days
|
|
101
|
+
nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
|
|
102
|
+
sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
|
|
103
|
+
|
|
104
|
+
# Convert date columns for further checks
|
|
105
|
+
sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
|
|
106
|
+
sub_df
|
|
107
|
+
)
|
|
108
|
+
grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
|
|
109
|
+
rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
|
|
110
|
+
|
|
111
|
+
if len(rows_with_diff_target) > 0:
|
|
112
|
+
unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
|
|
113
|
+
rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
|
|
114
|
+
rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
|
|
115
|
+
perc = len(rows_to_remove) * 100 / len(segment_df)
|
|
116
|
+
if eval_index == 0:
|
|
117
|
+
msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
|
|
118
|
+
perc, len(rows_to_remove), rows_to_remove.index.to_list()
|
|
119
|
+
)
|
|
120
|
+
else:
|
|
118
121
|
msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
|
|
119
|
-
|
|
122
|
+
perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
|
|
120
123
|
)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
124
|
+
return segment_df[~segment_df.index.isin(rows_to_remove.index)], msg
|
|
125
|
+
return segment_df, None
|
|
126
|
+
|
|
127
|
+
# Process the train part separately
|
|
128
|
+
logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
|
|
129
|
+
train_df, train_warning = process_df(train_df)
|
|
130
|
+
if train_warning:
|
|
131
|
+
warning_messages.append(train_warning)
|
|
132
|
+
logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
|
|
133
|
+
|
|
134
|
+
# Process each eval_set part separately
|
|
135
|
+
new_eval_dfs = []
|
|
136
|
+
for i, eval_df in enumerate(eval_dfs, 1):
|
|
137
|
+
logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
|
|
138
|
+
cleaned_eval_df, eval_warning = process_df(eval_df, i)
|
|
139
|
+
if eval_warning:
|
|
140
|
+
warning_messages.append(eval_warning)
|
|
141
|
+
logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
|
|
142
|
+
new_eval_dfs.append(cleaned_eval_df)
|
|
143
|
+
|
|
144
|
+
# Combine the processed train and eval parts back into one dataset
|
|
145
|
+
logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
|
|
146
|
+
if new_eval_dfs:
|
|
147
|
+
df = pd.concat([train_df] + new_eval_dfs)
|
|
148
|
+
else:
|
|
149
|
+
df = train_df
|
|
150
|
+
logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
|
|
151
|
+
|
|
152
|
+
return df, warning_messages
|
|
134
153
|
|
|
135
154
|
|
|
136
155
|
def clean_full_duplicates(
|
|
137
|
-
df: pd.DataFrame, logger: Optional[Logger] = None,
|
|
138
|
-
) -> pd.DataFrame:
|
|
156
|
+
df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
|
|
157
|
+
) -> Tuple[pd.DataFrame, Optional[str]]:
|
|
158
|
+
if logger is None:
|
|
159
|
+
logger = logging.getLogger()
|
|
160
|
+
logger.setLevel(logging.FATAL)
|
|
161
|
+
if bundle is None:
|
|
162
|
+
bundle = get_custom_bundle()
|
|
163
|
+
|
|
139
164
|
nrows = len(df)
|
|
140
165
|
if nrows == 0:
|
|
141
|
-
return df
|
|
166
|
+
return df, None
|
|
142
167
|
# Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
|
|
143
168
|
unique_columns = df.columns.tolist()
|
|
144
169
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
145
170
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
171
|
+
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
172
|
+
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
146
173
|
if SORT_ID in unique_columns:
|
|
147
174
|
unique_columns.remove(SORT_ID)
|
|
148
175
|
if EVAL_SET_INDEX in unique_columns:
|
|
149
176
|
unique_columns.remove(EVAL_SET_INDEX)
|
|
177
|
+
|
|
150
178
|
logger.info(f"Dataset shape before clean duplicates: {df.shape}")
|
|
151
179
|
# Train segment goes first so if duplicates are found in train and eval set
|
|
152
180
|
# then we keep unique rows in train segment
|
|
@@ -155,11 +183,9 @@ def clean_full_duplicates(
|
|
|
155
183
|
nrows_after_full_dedup = len(df)
|
|
156
184
|
share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
|
|
157
185
|
if share_full_dedup > 0:
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
# print(msg)
|
|
162
|
-
# self.warning_counter.increment()
|
|
186
|
+
logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
|
|
187
|
+
|
|
188
|
+
msg = None
|
|
163
189
|
if TARGET in df.columns:
|
|
164
190
|
unique_columns.remove(TARGET)
|
|
165
191
|
marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
|
|
@@ -170,13 +196,10 @@ def clean_full_duplicates(
|
|
|
170
196
|
share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
|
|
171
197
|
|
|
172
198
|
msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
|
|
173
|
-
logger.warning(msg)
|
|
174
|
-
if not silent:
|
|
175
|
-
print(msg)
|
|
176
199
|
df = df.drop_duplicates(subset=unique_columns, keep=False)
|
|
177
200
|
logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
|
|
178
201
|
|
|
179
|
-
return df
|
|
202
|
+
return df, msg
|
|
180
203
|
|
|
181
204
|
|
|
182
205
|
def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
|