upgini 1.1.244a13__py3-none-any.whl → 1.1.244a15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

upgini/dataset.py CHANGED
@@ -36,14 +36,12 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
-    SearchKey,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
-from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.email_utils import EmailSearchKeyConverter
 
 try:
@@ -817,19 +815,19 @@ class Dataset:  # (pd.DataFrame):
 
         self.__convert_features_types()
 
-        search_keys = {
-            col: SearchKey.from_meaning_type(key_type)
-            for col, key_type in self.meaning_types.items()
-            if SearchKey.from_meaning_type(key_type) is not None
-        }
+        # search_keys = {
+        #     col: SearchKey.from_meaning_type(key_type)
+        #     for col, key_type in self.meaning_types.items()
+        #     if SearchKey.from_meaning_type(key_type) is not None
+        # }
 
-        if validate_target:
-            need_full_defuplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
-        else:
-            need_full_defuplication = True
+        # if validate_target:
+        #     need_full_defuplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
+        # else:
+        #     need_full_defuplication = True
 
-        if need_full_defuplication:
-            self.__clean_duplicates(silent_mode)
+        # if need_full_defuplication:
+        #     self.__clean_duplicates(silent_mode)
 
         self.__validate_dataset(validate_target, silent_mode)
 
upgini/features_enricher.py CHANGED
@@ -65,7 +65,10 @@ from upgini.utils.datetime_utils import (
     is_blocked_time_series,
     is_time_series,
 )
-from upgini.utils.deduplicate_utils import remove_fintech_duplicates
+from upgini.utils.deduplicate_utils import (
+    clean_full_duplicates,
+    remove_fintech_duplicates,
+)
 from upgini.utils.display_utils import (
     display_html_dataframe,
     do_without_pandas_limits,
@@ -1850,6 +1853,8 @@ class FeaturesEnricher(TransformerMixin):
         )
         meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
 
+        df = clean_full_duplicates(df, self.logger, silent=silent_mode)
+
         df = df.reset_index(drop=True)
         system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
         df_with_original_index = df[system_columns_with_original_index].copy()
@@ -2131,6 +2136,10 @@ class FeaturesEnricher(TransformerMixin):
 
         df = self.__add_country_code(df, self.fit_search_keys)
 
+        need_full_defuplication, df = remove_fintech_duplicates(df, self.fit_search_keys, self.logger)
+        if need_full_defuplication:
+            df = clean_full_duplicates(df, self.logger)
+
         date_column = self._get_date_column(self.fit_search_keys)
         self.__adjust_cv(df, date_column, model_task_type)
 
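Taken together, the two calls above form a two-stage cleanup during fit: remove_fintech_duplicates drops rows whose personal search keys repeat with contradictory targets, and signals via its first return value whether a full-row pass is still needed. A minimal sketch of that calling contract, with hypothetical column names ("phone", "date", "target") and toy data not taken from this diff:

import logging
import pandas as pd
from upgini.metadata import SearchKey
from upgini.utils.deduplicate_utils import clean_full_duplicates, remove_fintech_duplicates

logger = logging.getLogger("upgini")
df = pd.DataFrame({
    "phone": ["+15551234567", "+15551234567", "+15557654321"],
    "date": ["2023-01-01", "2023-01-15", "2023-02-01"],
    "target": [0, 1, 0],
})
search_keys = {"phone": SearchKey.PHONE, "date": SearchKey.DATE}

# Stage 1: resolve conflicting targets behind repeated personal keys
need_full_dedup, df = remove_fintech_duplicates(df, search_keys, logger)
# Stage 2: full-row deduplication only when stage 1 asks for it
if need_full_dedup:
    df = clean_full_duplicates(df, logger)
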
upgini/metrics.py CHANGED
@@ -203,6 +203,7 @@ class EstimatorWrapper:
         add_params: Optional[Dict[str, Any]] = None,
         groups: Optional[np.ndarray] = None,
         text_features: Optional[List[str]] = None,
+        logger: Optional[logging.Logger] = None,
     ):
         self.estimator = estimator
         self.scorer = scorer
@@ -216,6 +217,7 @@ class EstimatorWrapper:
         self.cv_estimators = None
         self.groups = groups
         self.text_features = text_features
+        self.logger = logger or logging.getLogger()
 
     def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
         X, y, _, fit_params = self._prepare_to_fit(X, y)
@@ -411,13 +413,14 @@ class CatBoostWrapper(EstimatorWrapper):
         emb_pattern = r"(.+)_emb\d+"
         self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
         embedding_features = []
-        if len(self.emb_features) > 1:
+        if len(self.emb_features) > 3:  # There is no reason to reduce embeddings dimension with less than 4
             X, embedding_features = self.group_embeddings(X)
             params["embedding_features"] = embedding_features
 
         # Find text features from passed in generate_features
         if self.text_features is not None:
             self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
+
             params["text_features"] = self.text_features
 
         # Find rest categorical features
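The raised threshold gates self.group_embeddings, whose body is not part of this diff. A plausible minimal sketch of that packing step, assuming CatBoost's convention of one array-valued column per embedding passed via embedding_features (the function name here is illustrative, not the library's):

import re
import pandas as pd

def pack_embeddings(X: pd.DataFrame):
    # Collect the generated "<name>_emb<N>" scalar columns matched above
    emb_cols = [c for c in X.columns if re.match(r"(.+)_emb\d+", c)]
    X = X.copy()
    # Fold them into a single list-valued column that CatBoost can
    # consume through the embedding_features parameter
    X["embeddings"] = X[emb_cols].to_numpy().tolist()
    return X.drop(columns=emb_cols), ["embeddings"]
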
upgini/utils/deduplicate_utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Tuple, Union
 
 import pandas as pd
 
-from upgini.metadata import TARGET, ModelTaskType, SearchKey
+from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
 from upgini.resource_bundle import bundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
@@ -35,8 +35,15 @@ def remove_fintech_duplicates(
     if len(personal_cols) == 0:
         return need_full_deduplication, df
 
-    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+    sub_df = df[personal_cols + [date_col, TARGET]]
 
+    # Fast check for duplicates by personal keys
+    if not sub_df[personal_cols].duplicated().any():
+        return need_full_deduplication, df
+
+    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
+
+    # counts of diff dates by set of personal keys
     uniques = grouped_by_personal_cols[date_col].nunique()
     total = len(uniques)
     diff_dates = len(uniques[uniques > 1])
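The added pre-check avoids a groupby when no personal key repeats at all: pandas duplicated() over the key columns is a single cheap pass, while the per-group nunique() is only computed once duplicates are known to exist. A toy illustration of both steps (data and key names are made up):

import pandas as pd

sub_df = pd.DataFrame({
    "phone": ["p1", "p1", "p2", "p2"],
    "date": ["2023-01-01", "2023-01-09", "2023-03-01", "2023-03-01"],
})
# Cheap early exit: skip grouping entirely when keys never repeat
if sub_df[["phone"]].duplicated().any():
    # Count distinct dates per key, as in the hunk above
    uniques = sub_df.groupby(["phone"])["date"].nunique()
    print(uniques[uniques > 1])  # only p1 spans two distinct dates
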
@@ -47,35 +54,83 @@ def remove_fintech_duplicates(
 
     need_full_deduplication = False
 
-    duplicates = df.duplicated(personal_cols, keep=False)
-    duplicate_rows = df[duplicates]
+    duplicates = sub_df.duplicated(personal_cols, keep=False)
+    duplicate_rows = sub_df[duplicates]
     if len(duplicate_rows) == 0:
         return need_full_deduplication, df
 
-    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
+    # if there is no different target values in personal keys duplicate rows
+    nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
+    if nonunique_target_groups.sum() == 0:
         return need_full_deduplication, df
 
     def has_diff_target_within_60_days(rows):
         rows = rows.sort_values(by=date_col)
         return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
 
-    df = DateTimeSearchKeyConverter(date_col).convert(df)
-    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+    nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
+    sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
+
+    sub_df = DateTimeSearchKeyConverter(date_col).convert(sub_df)
+    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
     rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
     if len(rows_with_diff_target) > 0:
-        perc = len(rows_with_diff_target) * 100 / len(df)
+        unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
+        rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
+        rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
+        perc = len(rows_to_remove) * 100 / len(df)
         msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
-            perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list()
+            perc, len(rows_to_remove), rows_to_remove.index.to_list()
         )
         if not silent:
             print(msg)
         if logger:
             logger.warning(msg)
-        df = df[~df.index.isin(rows_with_diff_target.index)]
+            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+        df = df[~df.index.isin(rows_to_remove.index)]
+        logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
 
     return need_full_deduplication, df
 
 
+def clean_full_duplicates(
+    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False
+) -> pd.DataFrame:
+    nrows = len(df)
+    if nrows == 0:
+        return
+    # Remove absolute duplicates (exclude system_record_id)
+    unique_columns = df.columns.tolist()
+    if SYSTEM_RECORD_ID in unique_columns:
+        unique_columns.remove(SYSTEM_RECORD_ID)
+    logger.info(f"Dataset shape before clean duplicates: {df.shape}")
+    df = df.drop_duplicates(subset=unique_columns)
+    logger.info(f"Dataset shape after clean duplicates: {df.shape}")
+    nrows_after_full_dedup = len(df)
+    share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
+    if share_full_dedup > 0:
+        msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
+        logger.warning(msg)
+        # if not silent_mode:
+        #     print(msg)
+        #     self.warning_counter.increment()
+    if TARGET in df.columns:
+        unique_columns.remove(TARGET)
+        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+        if marked_duplicates.sum() > 0:
+            dups_indices = df[marked_duplicates].index.to_list()
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
+            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
+
+            msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
+            logger.warning(msg)
+            if not silent:
+                print(msg)
+            df = df.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+
+
 def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
     for col, key_type in search_keys.items():
         if (isinstance(keys, list) and key_type in keys) or key_type == keys:
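As a behavior sketch of clean_full_duplicates on toy data (column "a" stands in for all non-system, non-target columns): exact duplicates keep their first occurrence, while rows that agree on features but disagree on the target are dropped entirely via keep=False:

import pandas as pd

df = pd.DataFrame({
    "a": [1, 1, 2, 2, 3],
    "target": [0, 0, 0, 1, 1],
})
# First pass: exact duplicates -> keep the first occurrence
df = df.drop_duplicates()  # removes one (1, 0) row
# Second pass: same features, conflicting target -> drop all of them
df = df.drop_duplicates(subset=["a"], keep=False)  # removes both a == 2 rows
print(df)  # rows (1, 0) and (3, 1) remain
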
upgini-1.1.244a13.dist-info/METADATA → upgini-1.1.244a15.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.244a13
+Version: 1.1.244a15
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
upgini-1.1.244a13.dist-info/RECORD → upgini-1.1.244a15.dist-info/RECORD RENAMED
@@ -1,12 +1,12 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=2oOmBe8_mpwJ8Fw14gw4uZ1GgLU4PtjozkXhvIXhRq0,50022
+upgini/dataset.py,sha256=WGpnmpnmfdyB2DAwaj7mkk2s0e-6Z6bg5BWj1lUE2p0,49960
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=0dcpk0jFmvDrMmjMD2XlJhiW7la8YCKBbWEGJQSA7Uc,165283
+upgini/features_enricher.py,sha256=a3RBqMMxY3lH6bkvc20I3zyL5oQF3VUDUIvwqgtzxxA,165592
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
 upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
-upgini/metrics.py,sha256=3LP_7yo1LYCllxI5E_eorrcTTX2MTkSsQwydQTlenbo,25952
+upgini/metrics.py,sha256=BCEotBr4_PCfUheswZ_FPAj6Lk_P-iyl9Qfi8WqdbqY,26136
 upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -40,7 +40,7 @@ upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
 upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
 upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
-upgini/utils/deduplicate_utils.py,sha256=nFRHUanDqCURk1tF7nuLzHqmpo8pJOW-UMEj_3PTBDg,3083
+upgini/utils/deduplicate_utils.py,sha256=OxJ3ygvRQL5H_h2Kn0mwRaj5Ux8FmCQ8ZV4YQvSRZyw,5794
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
 upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
 upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.244a13.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.244a13.dist-info/METADATA,sha256=8GXzJ6Sos2jMZZar9rcMgjyyIUPly2H0Yxqv4Gup9iw,48265
-upgini-1.1.244a13.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-upgini-1.1.244a13.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.244a13.dist-info/RECORD,,
+upgini-1.1.244a15.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.244a15.dist-info/METADATA,sha256=GslYuWCFvWkrO6G5g88d5yIzv5nqe4OdOqyVHHEKO0k,48265
+upgini-1.1.244a15.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+upgini-1.1.244a15.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.244a15.dist-info/RECORD,,