upgini 1.2.12__tar.gz → 1.2.13a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

Files changed (66)
  1. {upgini-1.2.12 → upgini-1.2.13a1}/PKG-INFO +1 -1
  2. upgini-1.2.13a1/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/features_enricher.py +1 -0
  4. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/metrics.py +1 -1
  5. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/resource_bundle/strings.properties +1 -1
  6. upgini-1.2.13a1/src/upgini/utils/deduplicate_utils.py +200 -0
  7. upgini-1.2.12/src/upgini/__about__.py +0 -1
  8. upgini-1.2.12/src/upgini/utils/deduplicate_utils.py +0 -195
  9. {upgini-1.2.12 → upgini-1.2.13a1}/.gitignore +0 -0
  10. {upgini-1.2.12 → upgini-1.2.13a1}/LICENSE +0 -0
  11. {upgini-1.2.12 → upgini-1.2.13a1}/README.md +0 -0
  12. {upgini-1.2.12 → upgini-1.2.13a1}/pyproject.toml +0 -0
  13. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/dataset.py +0 -0
  29. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/http.py +0 -0
  31. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/lazy_import.py +0 -0
  32. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/mdc/__init__.py +0 -0
  33. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/mdc/context.py +0 -0
  34. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/metadata.py +0 -0
  35. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/normalizer/normalize_utils.py +0 -0
  37. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/target_utils.py +0 -0
  64. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/track_info.py +0 -0
  65. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/utils/warning_counter.py +0 -0
  66. {upgini-1.2.12 → upgini-1.2.13a1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.12
3
+ Version: 1.2.13a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.13a1"
@@ -3322,6 +3322,7 @@ class FeaturesEnricher(TransformerMixin):
3322
3322
  # index overrites from result_features
3323
3323
  original_index_name = df_with_original_index.index.name
3324
3324
  df_with_original_index = df_with_original_index.reset_index()
3325
+ # TODO drop system_record_id before merge
3325
3326
  result_features = pd.merge(
3326
3327
  df_with_original_index,
3327
3328
  result_features,
@@ -526,7 +526,7 @@ class CatBoostWrapper(EstimatorWrapper):
526
526
  emb_name = "__grouped_embeddings"
527
527
  df = df.copy()
528
528
  df[self.emb_features] = df[self.emb_features].fillna(0.0)
529
- df[emb_name] = df[self.emb_features].values.tolist()
529
+ df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
530
530
  df = df.drop(columns=self.emb_features)
531
531
 
532
532
  return df, [emb_name]
@@ -96,7 +96,7 @@ invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit da
96
96
  unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
97
97
  invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
98
98
  invalid_country=All values of COUNTRY column `{}` are invalid
99
- invalid_ip=All values of IPv4 column `{}` are invalid
99
+ invalid_ip=All values of IP column `{}` are invalid
100
100
  # X and y validation
101
101
  unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
102
102
  x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
@@ -0,0 +1,200 @@
1
+ from logging import Logger
2
+ from typing import Dict, List, Optional, Union
3
+
4
+ import pandas as pd
5
+
6
+ from upgini.metadata import (
7
+ ENTITY_SYSTEM_RECORD_ID,
8
+ EVAL_SET_INDEX,
9
+ SORT_ID,
10
+ SYSTEM_RECORD_ID,
11
+ TARGET,
12
+ ModelTaskType,
13
+ SearchKey,
14
+ )
15
+ from upgini.resource_bundle import ResourceBundle
16
+ from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
+ from upgini.utils.target_utils import define_task
18
+
19
+
20
+ def remove_fintech_duplicates(
21
+ df: pd.DataFrame,
22
+ search_keys: Dict[str, SearchKey],
23
+ date_format: Optional[str] = None,
24
+ logger: Optional[Logger] = None,
25
+ silent=False,
26
+ bundle: ResourceBundle = None,
27
+ ) -> pd.DataFrame:
28
+ # Initial checks for target type and date column
29
+ date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
30
+ if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
31
+ return df
32
+
33
+ if date_col is None:
34
+ return df
35
+
36
+ personal_cols = []
37
+ phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
38
+ if phone_col:
39
+ personal_cols.append(phone_col)
40
+ email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
41
+ if email_col:
42
+ personal_cols.append(email_col)
43
+ hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
44
+ if hem_col:
45
+ personal_cols.append(hem_col)
46
+ if len(personal_cols) == 0:
47
+ return df
48
+
49
+ # Splitting into train and eval_set parts
50
+ if EVAL_SET_INDEX in df.columns:
51
+ train_df = df[df[EVAL_SET_INDEX] == 0]
52
+ eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
53
+ else:
54
+ train_df = df
55
+ eval_dfs = []
56
+
57
+ def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
58
+ """Process a subset of the dataset to remove duplicates based on personal keys."""
59
+ # Fast check for duplicates based on personal keys
60
+ if not segment_df[personal_cols].duplicated().any():
61
+ return segment_df
62
+
63
+ sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
64
+
65
+ # Group by personal columns to check for unique dates
66
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
67
+
68
+ # Checking for different dates by the same personal keys
69
+ uniques = grouped_by_personal_cols[date_col].nunique()
70
+ total = len(uniques)
71
+ diff_dates = len(uniques[uniques > 1])
72
+ if diff_dates / total >= 0.6:
73
+ return segment_df
74
+
75
+ # Check for duplicate rows
76
+ duplicates = sub_df.duplicated(personal_cols, keep=False)
77
+ duplicate_rows = sub_df[duplicates]
78
+ if len(duplicate_rows) == 0:
79
+ return segment_df
80
+
81
+ # Check if there are different target values for the same personal keys
82
+ nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
83
+ if nonunique_target_groups.sum() == 0:
84
+ return segment_df
85
+
86
+ # Helper function to check if there are different target values within 60 days
87
+ def has_diff_target_within_60_days(rows: pd.DataFrame):
88
+ rows = rows.sort_values(by=date_col)
89
+ return (
90
+ len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
91
+ > 0
92
+ )
93
+
94
+ # Filter rows with different target values within 60 days
95
+ nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
96
+ sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
97
+
98
+ # Convert date columns for further checks
99
+ sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
100
+ sub_df
101
+ )
102
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
103
+ rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
104
+
105
+ if len(rows_with_diff_target) > 0:
106
+ unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
107
+ rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
108
+ rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
109
+ perc = len(rows_to_remove) * 100 / len(segment_df)
110
+ if eval_index == 0:
111
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
112
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
113
+ )
114
+ else:
115
+ msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
116
+ perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
117
+ )
118
+ if not silent:
119
+ print(msg)
120
+ if logger:
121
+ logger.warning(msg)
122
+ return segment_df[~segment_df.index.isin(rows_to_remove.index)]
123
+ return segment_df
124
+
125
+ # Process the train part separately
126
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
127
+ train_df = process_df(train_df)
128
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
129
+
130
+ # Process each eval_set part separately
131
+ new_eval_dfs = []
132
+ for i, eval_df in enumerate(eval_dfs, 1):
133
+ logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
134
+ cleaned_eval_df = process_df(eval_df, i)
135
+ logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
136
+ new_eval_dfs.append(cleaned_eval_df)
137
+
138
+ # Combine the processed train and eval parts back into one dataset
139
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
140
+ if new_eval_dfs:
141
+ df = pd.concat([train_df] + new_eval_dfs)
142
+ else:
143
+ df = train_df
144
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
145
+
146
+ return df
147
+
148
+
149
+ def clean_full_duplicates(
150
+ df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
151
+ ) -> pd.DataFrame:
152
+ nrows = len(df)
153
+ if nrows == 0:
154
+ return df
155
+ # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
156
+ unique_columns = df.columns.tolist()
157
+ if SYSTEM_RECORD_ID in unique_columns:
158
+ unique_columns.remove(SYSTEM_RECORD_ID)
159
+ if ENTITY_SYSTEM_RECORD_ID in unique_columns:
160
+ unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
161
+ if SORT_ID in unique_columns:
162
+ unique_columns.remove(SORT_ID)
163
+ if EVAL_SET_INDEX in unique_columns:
164
+ unique_columns.remove(EVAL_SET_INDEX)
165
+ logger.info(f"Dataset shape before clean duplicates: {df.shape}")
166
+ # Train segment goes first so if duplicates are found in train and eval set
167
+ # then we keep unique rows in train segment
168
+ df = df.drop_duplicates(subset=unique_columns, keep="first")
169
+ logger.info(f"Dataset shape after clean duplicates: {df.shape}")
170
+ nrows_after_full_dedup = len(df)
171
+ share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
172
+ if share_full_dedup > 0:
173
+ msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
174
+ logger.warning(msg)
175
+ # if not silent_mode:
176
+ # print(msg)
177
+ # self.warning_counter.increment()
178
+ if TARGET in df.columns:
179
+ unique_columns.remove(TARGET)
180
+ marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
181
+ if marked_duplicates.sum() > 0:
182
+ dups_indices = df[marked_duplicates].index.to_list()
183
+ nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
184
+ num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
185
+ share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
186
+
187
+ msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
188
+ logger.warning(msg)
189
+ if not silent:
190
+ print(msg)
191
+ df = df.drop_duplicates(subset=unique_columns, keep=False)
192
+ logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
193
+
194
+ return df
195
+
196
+
197
+ def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
198
+ for col, key_type in search_keys.items():
199
+ if (isinstance(keys, list) and key_type in keys) or key_type == keys:
200
+ return col
@@ -1 +0,0 @@
1
- __version__ = "1.2.12"
@@ -1,195 +0,0 @@
1
- from logging import Logger
2
- from typing import Dict, List, Optional, Union
3
-
4
- import pandas as pd
5
-
6
- from upgini.metadata import (
7
- ENTITY_SYSTEM_RECORD_ID,
8
- EVAL_SET_INDEX,
9
- SORT_ID,
10
- SYSTEM_RECORD_ID,
11
- TARGET,
12
- ModelTaskType,
13
- SearchKey,
14
- )
15
- from upgini.resource_bundle import ResourceBundle
16
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
- from upgini.utils.target_utils import define_task
18
-
19
-
20
- def remove_fintech_duplicates(
21
- df: pd.DataFrame,
22
- search_keys: Dict[str, SearchKey],
23
- date_format: Optional[str] = None,
24
- logger: Optional[Logger] = None,
25
- silent=False,
26
- bundle: ResourceBundle = None,
27
- ) -> pd.DataFrame:
28
- # Base checks
29
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
30
- if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
31
- return df
32
-
33
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
34
- if date_col is None:
35
- return df
36
-
37
- personal_cols = []
38
- phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
39
- if phone_col:
40
- personal_cols.append(phone_col)
41
- email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
42
- if email_col:
43
- personal_cols.append(email_col)
44
- hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
45
- if hem_col:
46
- personal_cols.append(hem_col)
47
- if len(personal_cols) == 0:
48
- return df
49
-
50
- sub_df = df[personal_cols + [date_col, TARGET]]
51
-
52
- # Fast check for duplicates by personal keys
53
- if not sub_df[personal_cols].duplicated().any():
54
- return df
55
-
56
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
57
-
58
- # counts of diff dates by set of personal keys
59
- uniques = grouped_by_personal_cols[date_col].nunique()
60
- total = len(uniques)
61
- diff_dates = len(uniques[uniques > 1])
62
- if diff_dates / total >= 0.6:
63
- return df
64
-
65
- # Additional checks
66
-
67
- duplicates = sub_df.duplicated(personal_cols, keep=False)
68
- duplicate_rows = sub_df[duplicates]
69
- if len(duplicate_rows) == 0:
70
- return df
71
-
72
- # if there is no different target values in personal keys duplicate rows
73
- nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
74
- if nonunique_target_groups.sum() == 0:
75
- return df
76
-
77
- def has_diff_target_within_60_days(rows):
78
- rows = rows.sort_values(by=date_col)
79
- return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
80
-
81
- nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
82
- sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
83
-
84
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
85
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
86
- rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
87
- if len(rows_with_diff_target) > 0:
88
- unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
89
- if EVAL_SET_INDEX not in df.columns:
90
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
91
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
92
- perc = len(rows_to_remove) * 100 / len(df)
93
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
94
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
95
- )
96
- if not silent:
97
- print(msg)
98
- if logger:
99
- logger.warning(msg)
100
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
101
- df = df[~df.index.isin(rows_to_remove.index)]
102
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
103
- else:
104
- # Indices in train and eval_set can be the same so we remove rows from them separately
105
- train = df.query(f"{EVAL_SET_INDEX} == 0")
106
- train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
107
- train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
108
- train_perc = len(train_rows_to_remove) * 100 / len(train)
109
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
110
- train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
111
- )
112
- if not silent:
113
- print(msg)
114
- if logger:
115
- logger.warning(msg)
116
- logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
117
- train = train[~train.index.isin(train_rows_to_remove.index)]
118
- logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
119
-
120
- evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
121
- new_evals = []
122
- for i, eval in enumerate(evals):
123
- eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
124
- eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
125
- eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
126
- msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
127
- eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
128
- )
129
- if not silent:
130
- print(msg)
131
- if logger:
132
- logger.warning(msg)
133
- logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
134
- eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
135
- logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
136
- new_evals.append(eval)
137
-
138
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
139
- df = pd.concat([train] + new_evals)
140
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
141
- return df
142
-
143
-
144
- def clean_full_duplicates(
145
- df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
146
- ) -> pd.DataFrame:
147
- nrows = len(df)
148
- if nrows == 0:
149
- return df
150
- # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
151
- unique_columns = df.columns.tolist()
152
- if SYSTEM_RECORD_ID in unique_columns:
153
- unique_columns.remove(SYSTEM_RECORD_ID)
154
- if ENTITY_SYSTEM_RECORD_ID in unique_columns:
155
- unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
156
- if SORT_ID in unique_columns:
157
- unique_columns.remove(SORT_ID)
158
- if EVAL_SET_INDEX in unique_columns:
159
- unique_columns.remove(EVAL_SET_INDEX)
160
- logger.info(f"Dataset shape before clean duplicates: {df.shape}")
161
- # Train segment goes first so if duplicates are found in train and eval set
162
- # then we keep unique rows in train segment
163
- df = df.drop_duplicates(subset=unique_columns, keep="first")
164
- logger.info(f"Dataset shape after clean duplicates: {df.shape}")
165
- nrows_after_full_dedup = len(df)
166
- share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
167
- if share_full_dedup > 0:
168
- msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
169
- logger.warning(msg)
170
- # if not silent_mode:
171
- # print(msg)
172
- # self.warning_counter.increment()
173
- if TARGET in df.columns:
174
- unique_columns.remove(TARGET)
175
- marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
176
- if marked_duplicates.sum() > 0:
177
- dups_indices = df[marked_duplicates].index.to_list()
178
- nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
179
- num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
180
- share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
181
-
182
- msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
183
- logger.warning(msg)
184
- if not silent:
185
- print(msg)
186
- df = df.drop_duplicates(subset=unique_columns, keep=False)
187
- logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
188
-
189
- return df
190
-
191
-
192
- def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
193
- for col, key_type in search_keys.items():
194
- if (isinstance(keys, list) and key_type in keys) or key_type == keys:
195
- return col
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes