upgini 1.2.11__tar.gz → 1.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (66)
  1. {upgini-1.2.11 → upgini-1.2.13}/PKG-INFO +1 -1
  2. upgini-1.2.13/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/binary.py +4 -2
  4. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/features_enricher.py +1 -0
  5. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/metrics.py +1 -1
  6. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/strings.properties +2 -2
  7. upgini-1.2.13/src/upgini/utils/deduplicate_utils.py +200 -0
  8. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/features_validator.py +11 -11
  9. upgini-1.2.11/src/upgini/__about__.py +0 -1
  10. upgini-1.2.11/src/upgini/utils/deduplicate_utils.py +0 -195
  11. {upgini-1.2.11 → upgini-1.2.13}/.gitignore +0 -0
  12. {upgini-1.2.11 → upgini-1.2.13}/LICENSE +0 -0
  13. {upgini-1.2.11 → upgini-1.2.13}/README.md +0 -0
  14. {upgini-1.2.11 → upgini-1.2.13}/pyproject.toml +0 -0
  15. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/__init__.py +0 -0
  16. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/ads.py +0 -0
  17. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/all_operands.py +0 -0
  21. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/date.py +0 -0
  22. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/feature.py +0 -0
  23. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/groupby.py +0 -0
  24. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/operand.py +0 -0
  25. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/unary.py +0 -0
  26. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/autofe/vector.py +0 -0
  27. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/data_source/__init__.py +0 -0
  28. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/data_source/data_source_publisher.py +0 -0
  29. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/dataset.py +0 -0
  30. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/errors.py +0 -0
  31. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/http.py +0 -0
  32. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/lazy_import.py +0 -0
  33. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/mdc/__init__.py +0 -0
  34. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/mdc/context.py +0 -0
  35. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/metadata.py +0 -0
  36. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/normalizer/__init__.py +0 -0
  37. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/normalizer/normalize_utils.py +0 -0
  38. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/__init__.py +0 -0
  39. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/exceptions.py +0 -0
  40. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/datetime_utils.py +0 -0
  54. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/target_utils.py +0 -0
  64. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/track_info.py +0 -0
  65. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/utils/warning_counter.py +0 -0
  66. {upgini-1.2.11 → upgini-1.2.13}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.11
3
+ Version: 1.2.13
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.13"
@@ -142,9 +142,9 @@ class Distance(PandasOperand):
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
144
144
  1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
145
- )
145
+ ).astype(np.float64)
146
146
 
147
- # row-wise dot product
147
+ # row-wise dot product, handling None values
148
148
  def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
149
149
  left = left.apply(lambda x: np.array(x))
150
150
  right = right.apply(lambda x: np.array(x))
@@ -152,7 +152,9 @@ class Distance(PandasOperand):
152
152
  res = res.reindex(left.index.union(right.index))
153
153
  return res
154
154
 
155
+ # Calculate the norm of a vector, handling None values
155
156
  def __norm(self, vector: pd.Series) -> pd.Series:
157
+ vector = vector.fillna(np.nan)
156
158
  return np.sqrt(self.__dot(vector, vector))
157
159
 
158
160
 
@@ -3322,6 +3322,7 @@ class FeaturesEnricher(TransformerMixin):
3322
3322
  # index overrites from result_features
3323
3323
  original_index_name = df_with_original_index.index.name
3324
3324
  df_with_original_index = df_with_original_index.reset_index()
3325
+ # TODO drop system_record_id before merge
3325
3326
  result_features = pd.merge(
3326
3327
  df_with_original_index,
3327
3328
  result_features,
@@ -526,7 +526,7 @@ class CatBoostWrapper(EstimatorWrapper):
526
526
  emb_name = "__grouped_embeddings"
527
527
  df = df.copy()
528
528
  df[self.emb_features] = df[self.emb_features].fillna(0.0)
529
- df[emb_name] = df[self.emb_features].values.tolist()
529
+ df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
530
530
  df = df.drop(columns=self.emb_features)
531
531
 
532
532
  return df, [emb_name]
@@ -22,7 +22,7 @@ slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=
22
22
  slack_community_alt=Upgini Slack community
23
23
  version_warning=\nWARNING: Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
24
24
  unregistered_with_personal_keys=\nWARNING: Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
25
- date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
25
+ date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
26
26
  date_search_without_time_series=\nWARNING: Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
27
27
  metrics_exclude_paid_features=\nWARNING: Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
28
28
  metrics_no_important_free_features=\nWARNING: No important free features to calculate metrics
@@ -96,7 +96,7 @@ invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit da
96
96
  unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
97
97
  invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
98
98
  invalid_country=All values of COUNTRY column `{}` are invalid
99
- invalid_ip=All values of IPv4 column `{}` are invalid
99
+ invalid_ip=All values of IP column `{}` are invalid
100
100
  # X and y validation
101
101
  unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
102
102
  x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
@@ -0,0 +1,200 @@
1
+ from logging import Logger
2
+ from typing import Dict, List, Optional, Union
3
+
4
+ import pandas as pd
5
+
6
+ from upgini.metadata import (
7
+ ENTITY_SYSTEM_RECORD_ID,
8
+ EVAL_SET_INDEX,
9
+ SORT_ID,
10
+ SYSTEM_RECORD_ID,
11
+ TARGET,
12
+ ModelTaskType,
13
+ SearchKey,
14
+ )
15
+ from upgini.resource_bundle import ResourceBundle
16
+ from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
+ from upgini.utils.target_utils import define_task
18
+
19
+
20
+ def remove_fintech_duplicates(
21
+ df: pd.DataFrame,
22
+ search_keys: Dict[str, SearchKey],
23
+ date_format: Optional[str] = None,
24
+ logger: Optional[Logger] = None,
25
+ silent=False,
26
+ bundle: ResourceBundle = None,
27
+ ) -> pd.DataFrame:
28
+ # Initial checks for target type and date column
29
+ date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
30
+ if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
31
+ return df
32
+
33
+ if date_col is None:
34
+ return df
35
+
36
+ personal_cols = []
37
+ phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
38
+ if phone_col:
39
+ personal_cols.append(phone_col)
40
+ email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
41
+ if email_col:
42
+ personal_cols.append(email_col)
43
+ hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
44
+ if hem_col:
45
+ personal_cols.append(hem_col)
46
+ if len(personal_cols) == 0:
47
+ return df
48
+
49
+ # Splitting into train and eval_set parts
50
+ if EVAL_SET_INDEX in df.columns:
51
+ train_df = df[df[EVAL_SET_INDEX] == 0]
52
+ eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
53
+ else:
54
+ train_df = df
55
+ eval_dfs = []
56
+
57
+ def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
58
+ """Process a subset of the dataset to remove duplicates based on personal keys."""
59
+ # Fast check for duplicates based on personal keys
60
+ if not segment_df[personal_cols].duplicated().any():
61
+ return segment_df
62
+
63
+ sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
64
+
65
+ # Group by personal columns to check for unique dates
66
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
67
+
68
+ # Checking for different dates by the same personal keys
69
+ uniques = grouped_by_personal_cols[date_col].nunique()
70
+ total = len(uniques)
71
+ diff_dates = len(uniques[uniques > 1])
72
+ if diff_dates / total >= 0.6:
73
+ return segment_df
74
+
75
+ # Check for duplicate rows
76
+ duplicates = sub_df.duplicated(personal_cols, keep=False)
77
+ duplicate_rows = sub_df[duplicates]
78
+ if len(duplicate_rows) == 0:
79
+ return segment_df
80
+
81
+ # Check if there are different target values for the same personal keys
82
+ nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
83
+ if nonunique_target_groups.sum() == 0:
84
+ return segment_df
85
+
86
+ # Helper function to check if there are different target values within 60 days
87
+ def has_diff_target_within_60_days(rows: pd.DataFrame):
88
+ rows = rows.sort_values(by=date_col)
89
+ return (
90
+ len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
91
+ > 0
92
+ )
93
+
94
+ # Filter rows with different target values within 60 days
95
+ nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
96
+ sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
97
+
98
+ # Convert date columns for further checks
99
+ sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
100
+ sub_df
101
+ )
102
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
103
+ rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
104
+
105
+ if len(rows_with_diff_target) > 0:
106
+ unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
107
+ rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
108
+ rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
109
+ perc = len(rows_to_remove) * 100 / len(segment_df)
110
+ if eval_index == 0:
111
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
112
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
113
+ )
114
+ else:
115
+ msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
116
+ perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
117
+ )
118
+ if not silent:
119
+ print(msg)
120
+ if logger:
121
+ logger.warning(msg)
122
+ return segment_df[~segment_df.index.isin(rows_to_remove.index)]
123
+ return segment_df
124
+
125
+ # Process the train part separately
126
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
127
+ train_df = process_df(train_df)
128
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
129
+
130
+ # Process each eval_set part separately
131
+ new_eval_dfs = []
132
+ for i, eval_df in enumerate(eval_dfs, 1):
133
+ logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
134
+ cleaned_eval_df = process_df(eval_df, i)
135
+ logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
136
+ new_eval_dfs.append(cleaned_eval_df)
137
+
138
+ # Combine the processed train and eval parts back into one dataset
139
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
140
+ if new_eval_dfs:
141
+ df = pd.concat([train_df] + new_eval_dfs)
142
+ else:
143
+ df = train_df
144
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
145
+
146
+ return df
147
+
148
+
149
+ def clean_full_duplicates(
150
+ df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
151
+ ) -> pd.DataFrame:
152
+ nrows = len(df)
153
+ if nrows == 0:
154
+ return df
155
+ # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
156
+ unique_columns = df.columns.tolist()
157
+ if SYSTEM_RECORD_ID in unique_columns:
158
+ unique_columns.remove(SYSTEM_RECORD_ID)
159
+ if ENTITY_SYSTEM_RECORD_ID in unique_columns:
160
+ unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
161
+ if SORT_ID in unique_columns:
162
+ unique_columns.remove(SORT_ID)
163
+ if EVAL_SET_INDEX in unique_columns:
164
+ unique_columns.remove(EVAL_SET_INDEX)
165
+ logger.info(f"Dataset shape before clean duplicates: {df.shape}")
166
+ # Train segment goes first so if duplicates are found in train and eval set
167
+ # then we keep unique rows in train segment
168
+ df = df.drop_duplicates(subset=unique_columns, keep="first")
169
+ logger.info(f"Dataset shape after clean duplicates: {df.shape}")
170
+ nrows_after_full_dedup = len(df)
171
+ share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
172
+ if share_full_dedup > 0:
173
+ msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
174
+ logger.warning(msg)
175
+ # if not silent_mode:
176
+ # print(msg)
177
+ # self.warning_counter.increment()
178
+ if TARGET in df.columns:
179
+ unique_columns.remove(TARGET)
180
+ marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
181
+ if marked_duplicates.sum() > 0:
182
+ dups_indices = df[marked_duplicates].index.to_list()
183
+ nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
184
+ num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
185
+ share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
186
+
187
+ msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
188
+ logger.warning(msg)
189
+ if not silent:
190
+ print(msg)
191
+ df = df.drop_duplicates(subset=unique_columns, keep=False)
192
+ logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
193
+
194
+ return df
195
+
196
+
197
+ def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
198
+ for col, key_type in search_keys.items():
199
+ if (isinstance(keys, list) and key_type in keys) or key_type == keys:
200
+ return col
@@ -56,25 +56,25 @@ class FeaturesValidator:
56
56
  # self.logger.warning(msg)
57
57
  # warning_counter.increment()
58
58
 
59
+ columns_renaming = columns_renaming or {}
60
+
59
61
  if empty_or_constant_features:
60
- if columns_renaming:
61
- display_names = [columns_renaming.get(f, f) for f in empty_or_constant_features]
62
- else:
63
- display_names = empty_or_constant_features
64
- msg = bundle.get("empty_or_contant_features").format(display_names)
62
+ msg = bundle.get("empty_or_contant_features").format(
63
+ [columns_renaming.get(f, f) for f in empty_or_constant_features]
64
+ )
65
65
  print(msg)
66
66
  self.logger.warning(msg)
67
67
  warning_counter.increment()
68
68
 
69
69
  high_cardinality_features = self.find_high_cardinality(df[features])
70
70
  if features_for_generate:
71
- high_cardinality_features = [f for f in high_cardinality_features if f not in features_for_generate]
71
+ high_cardinality_features = [
72
+ f for f in high_cardinality_features if columns_renaming.get(f, f) not in features_for_generate
73
+ ]
72
74
  if high_cardinality_features:
73
- if columns_renaming:
74
- display_names = [columns_renaming.get(f, f) for f in high_cardinality_features]
75
- else:
76
- display_names = empty_or_constant_features
77
- msg = bundle.get("high_cardinality_features").format(display_names)
75
+ msg = bundle.get("high_cardinality_features").format(
76
+ [columns_renaming.get(f, f) for f in high_cardinality_features]
77
+ )
78
78
  print(msg)
79
79
  self.logger.warning(msg)
80
80
  warning_counter.increment()
@@ -1 +0,0 @@
1
- __version__ = "1.2.11"
@@ -1,195 +0,0 @@
1
- from logging import Logger
2
- from typing import Dict, List, Optional, Union
3
-
4
- import pandas as pd
5
-
6
- from upgini.metadata import (
7
- ENTITY_SYSTEM_RECORD_ID,
8
- EVAL_SET_INDEX,
9
- SORT_ID,
10
- SYSTEM_RECORD_ID,
11
- TARGET,
12
- ModelTaskType,
13
- SearchKey,
14
- )
15
- from upgini.resource_bundle import ResourceBundle
16
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
- from upgini.utils.target_utils import define_task
18
-
19
-
20
- def remove_fintech_duplicates(
21
- df: pd.DataFrame,
22
- search_keys: Dict[str, SearchKey],
23
- date_format: Optional[str] = None,
24
- logger: Optional[Logger] = None,
25
- silent=False,
26
- bundle: ResourceBundle = None,
27
- ) -> pd.DataFrame:
28
- # Base checks
29
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
30
- if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
31
- return df
32
-
33
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
34
- if date_col is None:
35
- return df
36
-
37
- personal_cols = []
38
- phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
39
- if phone_col:
40
- personal_cols.append(phone_col)
41
- email_col = _get_column_by_key(search_keys, SearchKey.EMAIL)
42
- if email_col:
43
- personal_cols.append(email_col)
44
- hem_col = _get_column_by_key(search_keys, SearchKey.HEM)
45
- if hem_col:
46
- personal_cols.append(hem_col)
47
- if len(personal_cols) == 0:
48
- return df
49
-
50
- sub_df = df[personal_cols + [date_col, TARGET]]
51
-
52
- # Fast check for duplicates by personal keys
53
- if not sub_df[personal_cols].duplicated().any():
54
- return df
55
-
56
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
57
-
58
- # counts of diff dates by set of personal keys
59
- uniques = grouped_by_personal_cols[date_col].nunique()
60
- total = len(uniques)
61
- diff_dates = len(uniques[uniques > 1])
62
- if diff_dates / total >= 0.6:
63
- return df
64
-
65
- # Additional checks
66
-
67
- duplicates = sub_df.duplicated(personal_cols, keep=False)
68
- duplicate_rows = sub_df[duplicates]
69
- if len(duplicate_rows) == 0:
70
- return df
71
-
72
- # if there is no different target values in personal keys duplicate rows
73
- nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
74
- if nonunique_target_groups.sum() == 0:
75
- return df
76
-
77
- def has_diff_target_within_60_days(rows):
78
- rows = rows.sort_values(by=date_col)
79
- return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
80
-
81
- nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
82
- sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
83
-
84
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
85
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
86
- rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
87
- if len(rows_with_diff_target) > 0:
88
- unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
89
- if EVAL_SET_INDEX not in df.columns:
90
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
91
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
92
- perc = len(rows_to_remove) * 100 / len(df)
93
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
94
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
95
- )
96
- if not silent:
97
- print(msg)
98
- if logger:
99
- logger.warning(msg)
100
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
101
- df = df[~df.index.isin(rows_to_remove.index)]
102
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
103
- else:
104
- # Indices in train and eval_set can be the same so we remove rows from them separately
105
- train = df.query(f"{EVAL_SET_INDEX} == 0")
106
- train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
107
- train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
108
- train_perc = len(train_rows_to_remove) * 100 / len(train)
109
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
110
- train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
111
- )
112
- if not silent:
113
- print(msg)
114
- if logger:
115
- logger.warning(msg)
116
- logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
117
- train = train[~train.index.isin(train_rows_to_remove.index)]
118
- logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
119
-
120
- evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
121
- new_evals = []
122
- for i, eval in enumerate(evals):
123
- eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
124
- eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
125
- eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
126
- msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
127
- eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
128
- )
129
- if not silent:
130
- print(msg)
131
- if logger:
132
- logger.warning(msg)
133
- logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
134
- eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
135
- logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
136
- new_evals.append(eval)
137
-
138
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
139
- df = pd.concat([train] + new_evals)
140
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
141
- return df
142
-
143
-
144
- def clean_full_duplicates(
145
- df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
146
- ) -> pd.DataFrame:
147
- nrows = len(df)
148
- if nrows == 0:
149
- return df
150
- # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
151
- unique_columns = df.columns.tolist()
152
- if SYSTEM_RECORD_ID in unique_columns:
153
- unique_columns.remove(SYSTEM_RECORD_ID)
154
- if ENTITY_SYSTEM_RECORD_ID in unique_columns:
155
- unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
156
- if SORT_ID in unique_columns:
157
- unique_columns.remove(SORT_ID)
158
- if EVAL_SET_INDEX in unique_columns:
159
- unique_columns.remove(EVAL_SET_INDEX)
160
- logger.info(f"Dataset shape before clean duplicates: {df.shape}")
161
- # Train segment goes first so if duplicates are found in train and eval set
162
- # then we keep unique rows in train segment
163
- df = df.drop_duplicates(subset=unique_columns, keep="first")
164
- logger.info(f"Dataset shape after clean duplicates: {df.shape}")
165
- nrows_after_full_dedup = len(df)
166
- share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
167
- if share_full_dedup > 0:
168
- msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
169
- logger.warning(msg)
170
- # if not silent_mode:
171
- # print(msg)
172
- # self.warning_counter.increment()
173
- if TARGET in df.columns:
174
- unique_columns.remove(TARGET)
175
- marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
176
- if marked_duplicates.sum() > 0:
177
- dups_indices = df[marked_duplicates].index.to_list()
178
- nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
179
- num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
180
- share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
181
-
182
- msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
183
- logger.warning(msg)
184
- if not silent:
185
- print(msg)
186
- df = df.drop_duplicates(subset=unique_columns, keep=False)
187
- logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
188
-
189
- return df
190
-
191
-
192
- def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
193
- for col, key_type in search_keys.items():
194
- if (isinstance(keys, list) and key_type in keys) or key_type == keys:
195
- return col
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes