upgini 1.2.12__py3-none-any.whl → 1.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the registry's advisory page for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.12"
1
+ __version__ = "1.2.13"
upgini/autofe/binary.py CHANGED
@@ -142,9 +142,9 @@ class Distance(PandasOperand):
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
144
144
  1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
145
- )
145
+ ).astype(np.float64)
146
146
 
147
- # row-wise dot product
147
+ # row-wise dot product, handling None values
148
148
  def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
149
149
  left = left.apply(lambda x: np.array(x))
150
150
  right = right.apply(lambda x: np.array(x))
@@ -152,7 +152,9 @@ class Distance(PandasOperand):
152
152
  res = res.reindex(left.index.union(right.index))
153
153
  return res
154
154
 
155
+ # Calculate the norm of a vector, handling None values
155
156
  def __norm(self, vector: pd.Series) -> pd.Series:
157
+ vector = vector.fillna(np.nan)
156
158
  return np.sqrt(self.__dot(vector, vector))
157
159
 
158
160
 
@@ -3322,6 +3322,7 @@ class FeaturesEnricher(TransformerMixin):
3322
3322
  # index overrites from result_features
3323
3323
  original_index_name = df_with_original_index.index.name
3324
3324
  df_with_original_index = df_with_original_index.reset_index()
3325
+ # TODO drop system_record_id before merge
3325
3326
  result_features = pd.merge(
3326
3327
  df_with_original_index,
3327
3328
  result_features,
upgini/metrics.py CHANGED
@@ -526,7 +526,7 @@ class CatBoostWrapper(EstimatorWrapper):
526
526
  emb_name = "__grouped_embeddings"
527
527
  df = df.copy()
528
528
  df[self.emb_features] = df[self.emb_features].fillna(0.0)
529
- df[emb_name] = df[self.emb_features].values.tolist()
529
+ df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
530
530
  df = df.drop(columns=self.emb_features)
531
531
 
532
532
  return df, [emb_name]
@@ -96,7 +96,7 @@ invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit da
96
96
  unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
97
97
  invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
98
98
  invalid_country=All values of COUNTRY column `{}` are invalid
99
- invalid_ip=All values of IPv4 column `{}` are invalid
99
+ invalid_ip=All values of IP column `{}` are invalid
100
100
  # X and y validation
101
101
  unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
102
102
  x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
@@ -25,12 +25,11 @@ def remove_fintech_duplicates(
25
25
  silent=False,
26
26
  bundle: ResourceBundle = None,
27
27
  ) -> pd.DataFrame:
28
- # Base checks
28
+ # Initial checks for target type and date column
29
29
  date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
30
30
  if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
31
31
  return df
32
32
 
33
- date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
34
33
  if date_col is None:
35
34
  return df
36
35
 
@@ -47,97 +46,103 @@ def remove_fintech_duplicates(
47
46
  if len(personal_cols) == 0:
48
47
  return df
49
48
 
50
- sub_df = df[personal_cols + [date_col, TARGET]]
51
-
52
- # Fast check for duplicates by personal keys
53
- if not sub_df[personal_cols].duplicated().any():
54
- return df
55
-
56
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
57
-
58
- # counts of diff dates by set of personal keys
59
- uniques = grouped_by_personal_cols[date_col].nunique()
60
- total = len(uniques)
61
- diff_dates = len(uniques[uniques > 1])
62
- if diff_dates / total >= 0.6:
63
- return df
64
-
65
- # Additional checks
66
-
67
- duplicates = sub_df.duplicated(personal_cols, keep=False)
68
- duplicate_rows = sub_df[duplicates]
69
- if len(duplicate_rows) == 0:
70
- return df
71
-
72
- # if there is no different target values in personal keys duplicate rows
73
- nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
74
- if nonunique_target_groups.sum() == 0:
75
- return df
76
-
77
- def has_diff_target_within_60_days(rows):
78
- rows = rows.sort_values(by=date_col)
79
- return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
80
-
81
- nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
82
- sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
83
-
84
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
85
- grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
86
- rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
87
- if len(rows_with_diff_target) > 0:
88
- unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
89
- if EVAL_SET_INDEX not in df.columns:
90
- rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
91
- rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
92
- perc = len(rows_to_remove) * 100 / len(df)
93
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
94
- perc, len(rows_to_remove), rows_to_remove.index.to_list()
95
- )
96
- if not silent:
97
- print(msg)
98
- if logger:
99
- logger.warning(msg)
100
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
101
- df = df[~df.index.isin(rows_to_remove.index)]
102
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
103
- else:
104
- # Indices in train and eval_set can be the same so we remove rows from them separately
105
- train = df.query(f"{EVAL_SET_INDEX} == 0")
106
- train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
107
- train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
108
- train_perc = len(train_rows_to_remove) * 100 / len(train)
109
- msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
110
- train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
49
+ # Splitting into train and eval_set parts
50
+ if EVAL_SET_INDEX in df.columns:
51
+ train_df = df[df[EVAL_SET_INDEX] == 0]
52
+ eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
53
+ else:
54
+ train_df = df
55
+ eval_dfs = []
56
+
57
+ def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
58
+ """Process a subset of the dataset to remove duplicates based on personal keys."""
59
+ # Fast check for duplicates based on personal keys
60
+ if not segment_df[personal_cols].duplicated().any():
61
+ return segment_df
62
+
63
+ sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
64
+
65
+ # Group by personal columns to check for unique dates
66
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
67
+
68
+ # Checking for different dates by the same personal keys
69
+ uniques = grouped_by_personal_cols[date_col].nunique()
70
+ total = len(uniques)
71
+ diff_dates = len(uniques[uniques > 1])
72
+ if diff_dates / total >= 0.6:
73
+ return segment_df
74
+
75
+ # Check for duplicate rows
76
+ duplicates = sub_df.duplicated(personal_cols, keep=False)
77
+ duplicate_rows = sub_df[duplicates]
78
+ if len(duplicate_rows) == 0:
79
+ return segment_df
80
+
81
+ # Check if there are different target values for the same personal keys
82
+ nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
83
+ if nonunique_target_groups.sum() == 0:
84
+ return segment_df
85
+
86
+ # Helper function to check if there are different target values within 60 days
87
+ def has_diff_target_within_60_days(rows: pd.DataFrame):
88
+ rows = rows.sort_values(by=date_col)
89
+ return (
90
+ len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
91
+ > 0
111
92
  )
93
+
94
+ # Filter rows with different target values within 60 days
95
+ nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
96
+ sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
97
+
98
+ # Convert date columns for further checks
99
+ sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
100
+ sub_df
101
+ )
102
+ grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
103
+ rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
104
+
105
+ if len(rows_with_diff_target) > 0:
106
+ unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
107
+ rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
108
+ rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
109
+ perc = len(rows_to_remove) * 100 / len(segment_df)
110
+ if eval_index == 0:
111
+ msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
112
+ perc, len(rows_to_remove), rows_to_remove.index.to_list()
113
+ )
114
+ else:
115
+ msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
116
+ perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
117
+ )
112
118
  if not silent:
113
119
  print(msg)
114
120
  if logger:
115
121
  logger.warning(msg)
116
- logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
117
- train = train[~train.index.isin(train_rows_to_remove.index)]
118
- logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
119
-
120
- evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
121
- new_evals = []
122
- for i, eval in enumerate(evals):
123
- eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
124
- eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
125
- eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
126
- msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
127
- eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
128
- )
129
- if not silent:
130
- print(msg)
131
- if logger:
132
- logger.warning(msg)
133
- logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
134
- eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
135
- logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
136
- new_evals.append(eval)
137
-
138
- logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
139
- df = pd.concat([train] + new_evals)
140
- logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
122
+ return segment_df[~segment_df.index.isin(rows_to_remove.index)]
123
+ return segment_df
124
+
125
+ # Process the train part separately
126
+ logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
127
+ train_df = process_df(train_df)
128
+ logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
129
+
130
+ # Process each eval_set part separately
131
+ new_eval_dfs = []
132
+ for i, eval_df in enumerate(eval_dfs, 1):
133
+ logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
134
+ cleaned_eval_df = process_df(eval_df, i)
135
+ logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
136
+ new_eval_dfs.append(cleaned_eval_df)
137
+
138
+ # Combine the processed train and eval parts back into one dataset
139
+ logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
140
+ if new_eval_dfs:
141
+ df = pd.concat([train_df] + new_eval_dfs)
142
+ else:
143
+ df = train_df
144
+ logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
145
+
141
146
  return df
142
147
 
143
148
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.12
3
+ Version: 1.2.13
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=dbW85A2PinQCZabwD2DNDTfOE9315GDtQQKAsJP8IXk,23
1
+ upgini/__about__.py,sha256=rQSlPcfj4yT4krIq6epTVQyBzIX4etVOgfupVkM-RnU,23
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=eRkI2qpV-IprB1dQAMxzto6I6Q3b3SBuDMVR1_OFlyA,188008
6
+ upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
- upgini/metrics.py,sha256=aKJwAYUGNRdiz9z-bxDxs4jGZQ_VkPXa7sZ52C0VpVI,31243
10
+ upgini/metrics.py,sha256=bgi1rc3vCCeCuwRX1doQSQCzaV5OEiYHv_6XIvapnaw,31254
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
@@ -15,7 +15,7 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
18
- upgini/autofe/binary.py,sha256=TRjEdxsfyPY5E8ksYfdKMmU6GtvALfGFPNVIG7DBhzM,7520
18
+ upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
19
  upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
20
  upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
21
  upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=hWldMqtv80lwv8HV00Hk2-3tflu4BkD6tiXOfGDZPl8,26458
33
+ upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -43,7 +43,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
45
  upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
46
- upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
46
+ upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.12.dist-info/METADATA,sha256=k_J1xVbmpvm56wJ_hDo17cEK6rXRhhqJp3rSbw233xA,48577
61
- upgini-1.2.12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.12.dist-info/RECORD,,
60
+ upgini-1.2.13.dist-info/METADATA,sha256=IRJWMi0M4nUgCqMwp4kffx8QXgR1DJ2VsqH5Y7-nQ2E,48577
61
+ upgini-1.2.13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.13.dist-info/RECORD,,