upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. More details are available on the original diff page.

Files changed (43):
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -10
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +78 -54
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +936 -541
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
  39. upgini-1.2.31a1.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
  42. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
@@ -6,15 +6,11 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import (
10
- is_numeric_dtype,
11
- is_period_dtype,
12
- )
9
+ from pandas.api.types import is_numeric_dtype
13
10
 
14
11
  from upgini.errors import ValidationError
15
- from upgini.metadata import SearchKey
12
+ from upgini.metadata import EVAL_SET_INDEX, SearchKey
16
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
17
- from upgini.utils.warning_counter import WarningCounter
18
14
 
19
15
  DATE_FORMATS = [
20
16
  "%Y-%m-%d",
@@ -31,18 +27,20 @@ DATE_FORMATS = [
31
27
  "%Y-%m-%dT%H:%M:%S.%f",
32
28
  ]
33
29
 
34
- DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
30
+ DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
35
31
 
36
32
 
37
33
  class DateTimeSearchKeyConverter:
38
34
  DATETIME_COL = "_date_time"
35
+ # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
36
+ MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
39
37
 
40
38
  def __init__(
41
39
  self,
42
40
  date_column: str,
43
41
  date_format: Optional[str] = None,
44
42
  logger: Optional[logging.Logger] = None,
45
- bundle: ResourceBundle = None,
43
+ bundle: Optional[ResourceBundle] = None,
46
44
  ):
47
45
  self.date_column = date_column
48
46
  self.date_format = date_format
@@ -53,6 +51,7 @@ class DateTimeSearchKeyConverter:
53
51
  self.logger.setLevel("FATAL")
54
52
  self.generated_features: List[str] = []
55
53
  self.bundle = bundle or get_custom_bundle()
54
+ self.has_old_dates = False
56
55
 
57
56
  @staticmethod
58
57
  def _int_to_opt(i: int) -> Optional[int]:
@@ -81,8 +80,8 @@ class DateTimeSearchKeyConverter:
81
80
  df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
82
81
  elif isinstance(df[self.date_column].values[0], datetime.date):
83
82
  df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
84
- elif is_period_dtype(df[self.date_column]):
85
- df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
83
+ elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
84
+ df[self.date_column] = df[self.date_column].dt.to_timestamp()
86
85
  elif is_numeric_dtype(df[self.date_column]):
87
86
  # 315532801 - 2524608001 - seconds
88
87
  # 315532801000 - 2524608001000 - milliseconds
@@ -94,11 +93,10 @@ class DateTimeSearchKeyConverter:
94
93
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
95
94
  elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
96
95
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
97
- elif df[self.date_column].apply(lambda x: 0 < x < 10 * 11).all():
96
+ elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
98
97
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
99
98
  else:
100
99
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
101
- self.logger.warning(msg)
102
100
  raise ValidationError(msg)
103
101
  else:
104
102
  df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
@@ -108,20 +106,66 @@ class DateTimeSearchKeyConverter:
108
106
  # as additional features
109
107
  seconds = "datetime_seconds"
110
108
  df[self.date_column] = df[self.date_column].dt.tz_localize(None)
109
+
110
+ df = self.clean_old_dates(df)
111
+
112
+ # Define function to apply sine and cosine transformations
113
+ def add_cyclical_features(df, column, period):
114
+ period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
+ sin_feature = f"datetime_{column}_sin{period_suffix}"
116
+ cos_feature = f"datetime_{column}_cos{period_suffix}"
117
+ if sin_feature not in df.columns:
118
+ df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
119
+ self.generated_features.append(sin_feature)
120
+ if cos_feature not in df.columns:
121
+ df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
+ self.generated_features.append(cos_feature)
123
+
124
+ df["quarter"] = df[self.date_column].dt.quarter
125
+
126
+ # Calculate the start date of the quarter for each timestamp
127
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
128
+
129
+ # Calculate the day in the quarter
130
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
131
+
132
+ # Vectorized calculation of days_in_quarter
133
+ quarter = df["quarter"]
134
+ start = df["quarter_start"]
135
+ year = start.dt.year
136
+ month = start.dt.month
137
+
138
+ quarter_end_year = np.where(quarter == 4, year + 1, year)
139
+ quarter_end_month = np.where(quarter == 4, 1, month + 3)
140
+
141
+ end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
142
+ end.index = df.index
143
+
144
+ df["days_in_quarter"] = (end - start).dt.days
145
+
146
+ add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
147
+
148
+ df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
149
+
111
150
  df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
112
151
 
113
152
  seconds_without_na = df[seconds].dropna()
114
153
  if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
115
154
  self.logger.info("Time found in date search key. Add extra features based on time")
116
- seconds_in_day = 60 * 60 * 24
117
- orders = [1, 2, 24, 48]
118
- for order in orders:
119
- sin_feature = f"datetime_time_sin_{order}"
120
- cos_feature = f"datetime_time_cos_{order}"
121
- df[sin_feature] = np.round(np.sin(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
122
- df[cos_feature] = np.round(np.cos(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
123
- self.generated_features.append(sin_feature)
124
- self.generated_features.append(cos_feature)
155
+
156
+ # Extract basic components
157
+ df["second"] = df[self.date_column].dt.second
158
+ df["minute"] = df[self.date_column].dt.minute
159
+ df["hour"] = df[self.date_column].dt.hour
160
+
161
+ # Apply cyclical transformations
162
+ add_cyclical_features(df, "second", 60) # Seconds in a minute
163
+ add_cyclical_features(df, "minute", 60) # Minutes in an hour
164
+ add_cyclical_features(df, "minute", 30) # Minutes in half an hour
165
+ add_cyclical_features(df, "hour", 24) # Hours in a day
166
+
167
+ # Drop intermediate columns if not needed
168
+ df.drop(columns=["second", "minute", "hour"], inplace=True)
125
169
 
126
170
  df.drop(columns=seconds, inplace=True)
127
171
 
@@ -147,7 +191,19 @@ class DateTimeSearchKeyConverter:
147
191
  return pd.to_datetime(df[self.date_column], format=date_format)
148
192
  except ValueError:
149
193
  pass
150
- raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
194
+ try:
195
+ return pd.to_datetime(df[self.date_column])
196
+ except ValueError:
197
+ raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
198
+
199
+ def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
200
+ condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
201
+ old_subset = df[condition]
202
+ if len(old_subset) > 0:
203
+ self.has_old_dates = True
204
+ df.loc[condition, self.date_column] = None
205
+ self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
206
+ return df
151
207
 
152
208
 
153
209
  def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
@@ -185,7 +241,10 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
185
241
  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
186
242
  df = df.copy()
187
243
  seconds = "datetime_seconds"
188
- df[date_col] = pd.to_datetime(df[date_col])
244
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
245
+ df[date_col] = df[date_col].dt.to_timestamp()
246
+ else:
247
+ df[date_col] = pd.to_datetime(df[date_col])
189
248
  df[date_col] = df[date_col].dt.tz_localize(None)
190
249
  df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
191
250
 
@@ -231,24 +290,25 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
231
290
  return len(accumulated_changing_columns) <= 2
232
291
 
233
292
 
234
- def validate_dates_distribution(
235
- X: pd.DataFrame,
293
+ def is_dates_distribution_valid(
294
+ df: pd.DataFrame,
236
295
  search_keys: Dict[str, SearchKey],
237
- logger: Optional[logging.Logger] = None,
238
- bundle: Optional[ResourceBundle] = None,
239
- warning_counter: Optional[WarningCounter] = None,
240
- ):
241
- maybe_date_col = None
242
- for key, key_type in search_keys.items():
243
- if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
244
- maybe_date_col = key
296
+ ) -> bool:
297
+ maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
298
+
299
+ if EVAL_SET_INDEX in df.columns:
300
+ X = df.query(f"{EVAL_SET_INDEX} == 0")
301
+ else:
302
+ X = df
245
303
 
246
304
  if maybe_date_col is None:
247
305
  for col in X.columns:
248
306
  if col in search_keys:
249
307
  continue
250
308
  try:
251
- if pd.__version__ >= "2.0.0":
309
+ if isinstance(X[col].dtype, pd.PeriodDtype):
310
+ pass
311
+ elif pd.__version__ >= "2.0.0":
252
312
  # Format mixed to avoid massive warnings
253
313
  pd.to_datetime(X[col], format="mixed")
254
314
  else:
@@ -261,7 +321,9 @@ def validate_dates_distribution(
261
321
  if maybe_date_col is None:
262
322
  return
263
323
 
264
- if pd.__version__ >= "2.0.0":
324
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
325
+ dates = X[maybe_date_col].dt.to_timestamp().dt.date
326
+ elif pd.__version__ >= "2.0.0":
265
327
  dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
266
328
  else:
267
329
  dates = pd.to_datetime(X[maybe_date_col]).dt.date
@@ -272,13 +334,4 @@ def validate_dates_distribution(
272
334
  date_counts_2 = date_counts[round(len(date_counts) / 2) :]
273
335
  ratio = date_counts_2.mean() / date_counts_1.mean()
274
336
 
275
- if ratio > 1.2 or ratio < 0.8:
276
- if warning_counter is not None:
277
- warning_counter.increment()
278
- if logger is None:
279
- logger = logging.getLogger("muted_logger")
280
- logger.setLevel("FATAL")
281
- bundle = bundle or get_custom_bundle()
282
- msg = bundle.get("x_unstable_by_date")
283
- print(msg)
284
- logger.warning(msg)
337
+ return ratio >= 0.8 and ratio <= 1.2
@@ -1,10 +1,19 @@
1
+ import logging
1
2
  from logging import Logger
2
- from typing import Dict, List, Optional, Union
3
+ from typing import Dict, List, Optional, Tuple, Union
3
4
 
4
5
  import pandas as pd
5
6
 
6
- from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
7
- from upgini.resource_bundle import ResourceBundle
7
+ from upgini.metadata import (
8
+ ENTITY_SYSTEM_RECORD_ID,
9
+ EVAL_SET_INDEX,
10
+ SORT_ID,
11
+ SYSTEM_RECORD_ID,
12
+ TARGET,
13
+ ModelTaskType,
14
+ SearchKey,
15
+ )
16
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
8
17
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
18
  from upgini.utils.target_utils import define_task
10
19
 
@@ -14,17 +23,19 @@ def remove_fintech_duplicates(
14
23
  search_keys: Dict[str, SearchKey],
15
24
  date_format: Optional[str] = None,
16
25
  logger: Optional[Logger] = None,
17
- silent=False,
18
26
  bundle: ResourceBundle = None,
19
- ) -> pd.DataFrame:
20
- # Base checks
27
+ ) -> Tuple[pd.DataFrame, Optional[List[str]]]:
28
+ # Initial checks for target type and date column
29
+ bundle = bundle or get_custom_bundle()
30
+ if logger is None:
31
+ logger = logging.getLogger()
32
+ logger.setLevel(logging.FATAL)
21
33
  date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
22
34
  if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
23
- return df
35
+ return df, []
24
36
 
25
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
26
37
  if date_col is None:
27
- return df
38
+ return df, []
28
39
 
29
40
  personal_cols = []
30
41
  phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
@@ -37,116 +48,133 @@ def remove_fintech_duplicates(
37
48
  if hem_col:
38
49
  personal_cols.append(hem_col)
39
50
  if len(personal_cols) == 0:
40
- return df
41
-
42
- sub_df = df[personal_cols + [date_col, TARGET]]
43
-
44
- # Fast check for duplicates by personal keys
45
- if not sub_df[personal_cols].duplicated().any():
46
- return df
47
-
48
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
49
-
50
- # counts of diff dates by set of personal keys
51
- uniques = grouped_by_personal_cols[date_col].nunique()
52
- total = len(uniques)
53
- diff_dates = len(uniques[uniques > 1])
54
- if diff_dates / total >= 0.6:
55
- return df
56
-
57
- # Additional checks
58
-
59
- duplicates = sub_df.duplicated(personal_cols, keep=False)
60
- duplicate_rows = sub_df[duplicates]
61
- if len(duplicate_rows) == 0:
62
- return df
63
-
64
- # if there is no different target values in personal keys duplicate rows
65
- nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
66
- if nonunique_target_groups.sum() == 0:
67
- return df
68
-
69
- def has_diff_target_within_60_days(rows):
70
- rows = rows.sort_values(by=date_col)
71
- return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
72
-
73
- nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
74
- sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
75
-
76
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
77
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
78
- rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
79
- if len(rows_with_diff_target) > 0:
80
- unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
81
- if EVAL_SET_INDEX not in df.columns:
82
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
83
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
84
- perc = len(rows_to_remove) * 100 / len(df)
85
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
86
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
87
- )
88
- if not silent:
89
- print(msg)
90
- if logger:
91
- logger.warning(msg)
92
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
93
- df = df[~df.index.isin(rows_to_remove.index)]
94
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
95
- else:
96
- # Indices in train and eval_set can be the same so we remove rows from them separately
97
- train = df.query(f"{EVAL_SET_INDEX} == 0")
98
- train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
99
- train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
100
- train_perc = len(train_rows_to_remove) * 100 / len(train)
101
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
102
- train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
51
+ return df, []
52
+
53
+ # Splitting into train and eval_set parts
54
+ if EVAL_SET_INDEX in df.columns:
55
+ train_df = df[df[EVAL_SET_INDEX] == 0]
56
+ eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
57
+ else:
58
+ train_df = df
59
+ eval_dfs = []
60
+
61
+ warning_messages = []
62
+
63
+ def process_df(segment_df: pd.DataFrame, eval_index=0) -> Tuple[pd.DataFrame, Optional[str]]:
64
+ """Process a subset of the dataset to remove duplicates based on personal keys."""
65
+ # Fast check for duplicates based on personal keys
66
+ if not segment_df[personal_cols].duplicated().any():
67
+ return segment_df, None
68
+
69
+ sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
70
+
71
+ # Group by personal columns to check for unique dates
72
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
73
+
74
+ # Checking for different dates by the same personal keys
75
+ uniques = grouped_by_personal_cols[date_col].nunique()
76
+ total = len(uniques)
77
+ diff_dates = len(uniques[uniques > 1])
78
+ if diff_dates / total >= 0.6:
79
+ return segment_df, None
80
+
81
+ # Check for duplicate rows
82
+ duplicates = sub_df.duplicated(personal_cols, keep=False)
83
+ duplicate_rows = sub_df[duplicates]
84
+ if len(duplicate_rows) == 0:
85
+ return segment_df, None
86
+
87
+ # Check if there are different target values for the same personal keys
88
+ nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
89
+ if nonunique_target_groups.sum() == 0:
90
+ return segment_df, None
91
+
92
+ # Helper function to check if there are different target values within 60 days
93
+ def has_diff_target_within_60_days(rows: pd.DataFrame):
94
+ rows = rows.sort_values(by=date_col)
95
+ return (
96
+ len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
97
+ > 0
103
98
  )
104
- if not silent:
105
- print(msg)
106
- if logger:
107
- logger.warning(msg)
108
- logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
109
- train = train[~train.index.isin(train_rows_to_remove.index)]
110
- logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
111
-
112
- evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
113
- new_evals = []
114
- for i, eval in enumerate(evals):
115
- eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
116
- eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
117
- eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
99
+
100
+ # Filter rows with different target values within 60 days
101
+ nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
102
+ sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
103
+
104
+ # Convert date columns for further checks
105
+ sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
106
+ sub_df
107
+ )
108
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
109
+ rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
110
+
111
+ if len(rows_with_diff_target) > 0:
112
+ unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
113
+ rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
114
+ rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
115
+ perc = len(rows_to_remove) * 100 / len(segment_df)
116
+ if eval_index == 0:
117
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
118
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
119
+ )
120
+ else:
118
121
  msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
119
- eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
122
+ perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
120
123
  )
121
- if not silent:
122
- print(msg)
123
- if logger:
124
- logger.warning(msg)
125
- logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
126
- eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
127
- logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
128
- new_evals.append(eval)
129
-
130
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
131
- df = pd.concat([train] + new_evals)
132
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
133
- return df
124
+ return segment_df[~segment_df.index.isin(rows_to_remove.index)], msg
125
+ return segment_df, None
126
+
127
+ # Process the train part separately
128
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
129
+ train_df, train_warning = process_df(train_df)
130
+ if train_warning:
131
+ warning_messages.append(train_warning)
132
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
133
+
134
+ # Process each eval_set part separately
135
+ new_eval_dfs = []
136
+ for i, eval_df in enumerate(eval_dfs, 1):
137
+ logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
138
+ cleaned_eval_df, eval_warning = process_df(eval_df, i)
139
+ if eval_warning:
140
+ warning_messages.append(eval_warning)
141
+ logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
142
+ new_eval_dfs.append(cleaned_eval_df)
143
+
144
+ # Combine the processed train and eval parts back into one dataset
145
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
146
+ if new_eval_dfs:
147
+ df = pd.concat([train_df] + new_eval_dfs)
148
+ else:
149
+ df = train_df
150
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
151
+
152
+ return df, warning_messages
134
153
 
135
154
 
136
155
  def clean_full_duplicates(
137
- df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
138
- ) -> pd.DataFrame:
156
+ df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
157
+ ) -> Tuple[pd.DataFrame, Optional[str]]:
158
+ if logger is None:
159
+ logger = logging.getLogger()
160
+ logger.setLevel(logging.FATAL)
161
+ if bundle is None:
162
+ bundle = get_custom_bundle()
163
+
139
164
  nrows = len(df)
140
165
  if nrows == 0:
141
- return df
166
+ return df, None
142
167
  # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
143
168
  unique_columns = df.columns.tolist()
144
169
  if SYSTEM_RECORD_ID in unique_columns:
145
170
  unique_columns.remove(SYSTEM_RECORD_ID)
171
+ if ENTITY_SYSTEM_RECORD_ID in unique_columns:
172
+ unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
146
173
  if SORT_ID in unique_columns:
147
174
  unique_columns.remove(SORT_ID)
148
175
  if EVAL_SET_INDEX in unique_columns:
149
176
  unique_columns.remove(EVAL_SET_INDEX)
177
+
150
178
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
151
179
  # Train segment goes first so if duplicates are found in train and eval set
152
180
  # then we keep unique rows in train segment
@@ -155,11 +183,9 @@ def clean_full_duplicates(
155
183
  nrows_after_full_dedup = len(df)
156
184
  share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
157
185
  if share_full_dedup > 0:
158
- msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
159
- logger.warning(msg)
160
- # if not silent_mode:
161
- # print(msg)
162
- # self.warning_counter.increment()
186
+ logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
187
+
188
+ msg = None
163
189
  if TARGET in df.columns:
164
190
  unique_columns.remove(TARGET)
165
191
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
@@ -170,13 +196,10 @@ def clean_full_duplicates(
170
196
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
171
197
 
172
198
  msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
173
- logger.warning(msg)
174
- if not silent:
175
- print(msg)
176
199
  df = df.drop_duplicates(subset=unique_columns, keep=False)
177
200
  logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
178
201
 
179
- return df
202
+ return df, msg
180
203
 
181
204
 
182
205
  def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]: