upgini 1.2.22__py3-none-any.whl → 1.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic. Review the registry's advisory for this version of upgini for details before upgrading.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.22"
1
+ __version__ = "1.2.23"
@@ -77,8 +77,8 @@ from upgini.utils.cv_utils import CVConfig, get_groups
77
77
  from upgini.utils.datetime_utils import (
78
78
  DateTimeSearchKeyConverter,
79
79
  is_blocked_time_series,
80
+ is_dates_distribution_valid,
80
81
  is_time_series,
81
- validate_dates_distribution,
82
82
  )
83
83
  from upgini.utils.deduplicate_utils import (
84
84
  clean_full_duplicates,
@@ -263,7 +263,7 @@ class FeaturesEnricher(TransformerMixin):
263
263
  dict()
264
264
  )
265
265
 
266
- validate_version(self.logger)
266
+ validate_version(self.logger, self.__log_warning)
267
267
  self.search_keys = search_keys or {}
268
268
  self.country_code = country_code
269
269
  self.__validate_search_keys(search_keys, search_id)
@@ -723,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
723
723
 
724
724
  start_time = time.time()
725
725
  try:
726
- result, _ = self.__inner_transform(
726
+ result, _, _ = self.__inner_transform(
727
727
  trace_id,
728
728
  X,
729
729
  exclude_features_sources=exclude_features_sources,
@@ -951,9 +951,7 @@ class FeaturesEnricher(TransformerMixin):
951
951
  gc.collect()
952
952
 
953
953
  if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
954
- print(self.bundle.get("metrics_no_important_free_features"))
955
- self.logger.warning("No client or free relevant ADS features found to calculate metrics")
956
- self.warning_counter.increment()
954
+ self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
957
955
  return None
958
956
 
959
957
  print(self.bundle.get("metrics_start"))
@@ -1654,9 +1652,7 @@ class FeaturesEnricher(TransformerMixin):
1654
1652
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1655
1653
  generated_features = []
1656
1654
  if date_column is not None:
1657
- converter = DateTimeSearchKeyConverter(
1658
- date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1659
- )
1655
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1660
1656
  df = converter.convert(df, keep_time=True)
1661
1657
  generated_features = converter.generated_features
1662
1658
 
@@ -1666,11 +1662,11 @@ class FeaturesEnricher(TransformerMixin):
1666
1662
  df = generator.generate(df)
1667
1663
  generated_features.extend(generator.generated_features)
1668
1664
 
1669
- normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1665
+ normalizer = Normalizer(self.bundle, self.logger)
1670
1666
  df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1671
1667
  columns_renaming = normalizer.columns_renaming
1672
1668
 
1673
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1669
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1674
1670
 
1675
1671
  num_samples = _num_samples(df)
1676
1672
  sample_threshold, sample_rows = (
@@ -1817,7 +1813,7 @@ class FeaturesEnricher(TransformerMixin):
1817
1813
  eval_df_with_index[EVAL_SET_INDEX] = idx + 1
1818
1814
  df = pd.concat([df, eval_df_with_index])
1819
1815
 
1820
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1816
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1821
1817
 
1822
1818
  # downsample if need to eval_set threshold
1823
1819
  num_samples = _num_samples(df)
@@ -1830,7 +1826,7 @@ class FeaturesEnricher(TransformerMixin):
1830
1826
  tmp_target_name = "__target"
1831
1827
  df = df.rename(columns={TARGET: tmp_target_name})
1832
1828
 
1833
- enriched_df, columns_renaming = self.__inner_transform(
1829
+ enriched_df, columns_renaming, generated_features = self.__inner_transform(
1834
1830
  trace_id,
1835
1831
  df,
1836
1832
  exclude_features_sources=exclude_features_sources,
@@ -1847,7 +1843,7 @@ class FeaturesEnricher(TransformerMixin):
1847
1843
 
1848
1844
  x_columns = [
1849
1845
  c
1850
- for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
1846
+ for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
1851
1847
  if c in enriched_df.columns
1852
1848
  ]
1853
1849
 
@@ -1869,7 +1865,7 @@ class FeaturesEnricher(TransformerMixin):
1869
1865
 
1870
1866
  df[TARGET] = validated_y
1871
1867
 
1872
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1868
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1873
1869
 
1874
1870
  num_samples = _num_samples(df)
1875
1871
  if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
@@ -1879,7 +1875,7 @@ class FeaturesEnricher(TransformerMixin):
1879
1875
  tmp_target_name = "__target"
1880
1876
  df = df.rename(columns={TARGET: tmp_target_name})
1881
1877
 
1882
- enriched_Xy, columns_renaming = self.__inner_transform(
1878
+ enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
1883
1879
  trace_id,
1884
1880
  df,
1885
1881
  exclude_features_sources=exclude_features_sources,
@@ -1896,7 +1892,7 @@ class FeaturesEnricher(TransformerMixin):
1896
1892
 
1897
1893
  x_columns = [
1898
1894
  c
1899
- for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
1895
+ for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
1900
1896
  if c in enriched_Xy.columns
1901
1897
  ]
1902
1898
 
@@ -1904,7 +1900,7 @@ class FeaturesEnricher(TransformerMixin):
1904
1900
  y_sampled = enriched_Xy[TARGET].copy()
1905
1901
  enriched_X = enriched_Xy.drop(columns=TARGET)
1906
1902
 
1907
- datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
1903
+ datasets_hash = hash_input(validated_X, validated_y, eval_set)
1908
1904
  self.__cached_sampled_datasets[datasets_hash] = (
1909
1905
  X_sampled,
1910
1906
  y_sampled,
@@ -2023,7 +2019,7 @@ class FeaturesEnricher(TransformerMixin):
2023
2019
  progress_bar: Optional[ProgressBar] = None,
2024
2020
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2025
2021
  add_fit_system_record_id: bool = False,
2026
- ) -> Tuple[pd.DataFrame, Dict[str, str]]:
2022
+ ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2027
2023
  if self._search_task is None:
2028
2024
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
2029
2025
 
@@ -2036,24 +2032,25 @@ class FeaturesEnricher(TransformerMixin):
2036
2032
 
2037
2033
  if len(self.feature_names_) == 0:
2038
2034
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
2039
- return X, {c: c for c in X.columns}
2035
+ return X, {c: c for c in X.columns}, []
2040
2036
 
2041
2037
  if self._has_paid_features(exclude_features_sources):
2042
2038
  msg = self.bundle.get("transform_with_paid_features")
2043
2039
  self.logger.warning(msg)
2044
2040
  self.__display_support_link(msg)
2045
- return None, {c: c for c in X.columns}
2041
+ return None, {c: c for c in X.columns}, []
2046
2042
 
2047
2043
  if not metrics_calculation:
2048
2044
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
2049
2045
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
2050
2046
  if transform_usage.has_limit:
2051
2047
  if len(X) > transform_usage.rest_rows:
2052
- msg = self.bundle.get("transform_usage_warning").format(len(X), transform_usage.rest_rows)
2048
+ rest_rows = max(transform_usage.rest_rows, 0)
2049
+ msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
2053
2050
  self.logger.warning(msg)
2054
2051
  print(msg)
2055
2052
  show_request_quote_button()
2056
- return None, {c: c for c in X.columns}
2053
+ return None, {c: c for c in X.columns}, []
2057
2054
  else:
2058
2055
  msg = self.bundle.get("transform_usage_info").format(
2059
2056
  transform_usage.limit, transform_usage.transformed_rows
@@ -2093,9 +2090,7 @@ class FeaturesEnricher(TransformerMixin):
2093
2090
  generated_features = []
2094
2091
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2095
2092
  if date_column is not None:
2096
- converter = DateTimeSearchKeyConverter(
2097
- date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2098
- )
2093
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2099
2094
  df = converter.convert(df)
2100
2095
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2101
2096
  generated_features.extend(converter.generated_features)
@@ -2110,7 +2105,7 @@ class FeaturesEnricher(TransformerMixin):
2110
2105
  df = generator.generate(df)
2111
2106
  generated_features.extend(generator.generated_features)
2112
2107
 
2113
- normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2108
+ normalizer = Normalizer(self.bundle, self.logger)
2114
2109
  df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2115
2110
  columns_renaming = normalizer.columns_renaming
2116
2111
 
@@ -2176,7 +2171,7 @@ class FeaturesEnricher(TransformerMixin):
2176
2171
  converter = PostalCodeSearchKeyConverter(postal_code)
2177
2172
  df = converter.convert(df)
2178
2173
 
2179
- generated_features = [f for f in generated_features if f in self.fit_generated_features]
2174
+ # generated_features = [f for f in generated_features if f in self.fit_generated_features]
2180
2175
 
2181
2176
  meaning_types = {col: key.value for col, key in search_keys.items()}
2182
2177
  for col in features_for_transform:
@@ -2216,9 +2211,11 @@ class FeaturesEnricher(TransformerMixin):
2216
2211
 
2217
2212
  df_without_features = df.drop(columns=features_not_to_pass)
2218
2213
 
2219
- df_without_features = clean_full_duplicates(
2220
- df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
2214
+ df_without_features, full_duplicates_warning = clean_full_duplicates(
2215
+ df_without_features, self.logger, bundle=self.bundle
2221
2216
  )
2217
+ if not silent_mode and full_duplicates_warning:
2218
+ self.__log_warning(full_duplicates_warning)
2222
2219
 
2223
2220
  del df
2224
2221
  gc.collect()
@@ -2337,7 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
2337
2334
  if add_fit_system_record_id:
2338
2335
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2339
2336
 
2340
- return result, columns_renaming
2337
+ return result, columns_renaming, generated_features
2341
2338
 
2342
2339
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2343
2340
  features_info = self._internal_features_info
@@ -2415,6 +2412,15 @@ class FeaturesEnricher(TransformerMixin):
2415
2412
  def __is_registered(self) -> bool:
2416
2413
  return self.api_key is not None and self.api_key != ""
2417
2414
 
2415
+ def __log_warning(self, message: str, show_support_link: bool = False):
2416
+ warning_num = self.warning_counter.increment()
2417
+ formatted_message = f"WARNING #{warning_num}: {message}\n"
2418
+ if show_support_link:
2419
+ self.__display_support_link(formatted_message)
2420
+ else:
2421
+ print(formatted_message)
2422
+ self.logger.warning(message)
2423
+
2418
2424
  def __inner_fit(
2419
2425
  self,
2420
2426
  trace_id: str,
@@ -2461,9 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
2461
2467
  checked_generate_features = []
2462
2468
  for gen_feature in self.generate_features:
2463
2469
  if gen_feature not in x_columns:
2464
- msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2465
- print(msg)
2466
- self.logger.warning(msg)
2470
+ self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2467
2471
  else:
2468
2472
  checked_generate_features.append(gen_feature)
2469
2473
  self.generate_features = checked_generate_features
@@ -2524,9 +2528,10 @@ class FeaturesEnricher(TransformerMixin):
2524
2528
  self.date_format,
2525
2529
  self.logger,
2526
2530
  bundle=self.bundle,
2527
- warnings_counter=self.warning_counter,
2528
2531
  )
2529
2532
  df = converter.convert(df, keep_time=True)
2533
+ if converter.has_old_dates:
2534
+ self.__log_warning(self.bundle.get("dataset_drop_old_dates"))
2530
2535
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2531
2536
  self.fit_generated_features.extend(converter.generated_features)
2532
2537
  else:
@@ -2541,7 +2546,9 @@ class FeaturesEnricher(TransformerMixin):
2541
2546
  self.fit_generated_features.extend(generator.generated_features)
2542
2547
 
2543
2548
  # Checks that need validated date
2544
- validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2549
+
2550
+ if not is_dates_distribution_valid(df, self.fit_search_keys):
2551
+ self.__log_warning(bundle.get("x_unstable_by_date"))
2545
2552
 
2546
2553
  if (
2547
2554
  is_numeric_dtype(df[self.TARGET_NAME])
@@ -2550,18 +2557,25 @@ class FeaturesEnricher(TransformerMixin):
2550
2557
  ):
2551
2558
  self._validate_PSI(df.sort_values(by=maybe_date_column))
2552
2559
 
2553
- normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
2560
+ normalizer = Normalizer(self.bundle, self.logger)
2554
2561
  df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
2555
2562
  df, self.fit_search_keys, self.fit_generated_features
2556
2563
  )
2557
2564
  self.fit_columns_renaming = normalizer.columns_renaming
2565
+ if normalizer.removed_features:
2566
+ self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
2558
2567
 
2559
2568
  self.__adjust_cv(df)
2560
2569
 
2561
- df = remove_fintech_duplicates(
2570
+ df, fintech_warnings = remove_fintech_duplicates(
2562
2571
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2563
2572
  )
2564
- df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2573
+ if fintech_warnings:
2574
+ for fintech_warning in fintech_warnings:
2575
+ self.__log_warning(fintech_warning)
2576
+ df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2577
+ if full_duplicates_warning:
2578
+ self.__log_warning(full_duplicates_warning)
2565
2579
 
2566
2580
  # Explode multiple search keys
2567
2581
  df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
@@ -2621,9 +2635,12 @@ class FeaturesEnricher(TransformerMixin):
2621
2635
 
2622
2636
  features_columns = [c for c in df.columns if c not in non_feature_columns]
2623
2637
 
2624
- features_to_drop = FeaturesValidator(self.logger).validate(
2625
- df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
2638
+ features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
2639
+ df, features_columns, self.generate_features, self.fit_columns_renaming
2626
2640
  )
2641
+ if feature_validator_warnings:
2642
+ for warning in feature_validator_warnings:
2643
+ self.__log_warning(warning)
2627
2644
  self.fit_dropped_features.update(features_to_drop)
2628
2645
  df = df.drop(columns=features_to_drop)
2629
2646
 
@@ -2739,9 +2756,7 @@ class FeaturesEnricher(TransformerMixin):
2739
2756
  zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
2740
2757
  if zero_hit_columns:
2741
2758
  msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
2742
- self.logger.warning(msg)
2743
- self.__display_support_link(msg)
2744
- self.warning_counter.increment()
2759
+ self.__log_warning(msg, show_support_link=True)
2745
2760
 
2746
2761
  if (
2747
2762
  self._search_task.unused_features_for_generation is not None
@@ -2751,9 +2766,7 @@ class FeaturesEnricher(TransformerMixin):
2751
2766
  dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
2752
2767
  ]
2753
2768
  msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
2754
- self.logger.warning(msg)
2755
- print(msg)
2756
- self.warning_counter.increment()
2769
+ self.__log_warning(msg)
2757
2770
 
2758
2771
  self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
2759
2772
 
@@ -3154,7 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
3154
3167
  maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3155
3168
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3156
3169
  # TODO cast date column to single dtype
3157
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
3170
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3158
3171
  converted_X = date_converter.convert(X)
3159
3172
  min_date = converted_X[maybe_date_col].min()
3160
3173
  max_date = converted_X[maybe_date_col].max()
@@ -3196,7 +3209,7 @@ class FeaturesEnricher(TransformerMixin):
3196
3209
  logger.warning(msg)
3197
3210
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3198
3211
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3199
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
3212
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
3200
3213
  df = converter.convert(df)
3201
3214
  return df
3202
3215
 
@@ -3768,15 +3781,15 @@ class FeaturesEnricher(TransformerMixin):
3768
3781
  if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
3769
3782
  msg = self.bundle.get("search_key_country_and_country_code")
3770
3783
  self.logger.warning(msg)
3771
- print(msg)
3784
+ if not silent_mode:
3785
+ self.__log_warning(msg)
3772
3786
  self.country_code = None
3773
3787
 
3774
3788
  if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
3775
3789
  msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
3776
3790
  self.logger.warning(msg)
3777
3791
  if not silent_mode:
3778
- self.warning_counter.increment()
3779
- print(msg)
3792
+ self.__log_warning(msg)
3780
3793
 
3781
3794
  valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
3782
3795
  else:
@@ -3810,27 +3823,22 @@ class FeaturesEnricher(TransformerMixin):
3810
3823
  and not silent_mode
3811
3824
  ):
3812
3825
  msg = self.bundle.get("date_only_search")
3813
- print(msg)
3814
- self.logger.warning(msg)
3815
- self.warning_counter.increment()
3826
+ self.__log_warning(msg)
3816
3827
 
3817
3828
  maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
3818
3829
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
3819
3830
  date_column = next(iter(maybe_date))
3820
3831
  if x[date_column].nunique() > 0.9 * _num_samples(x):
3821
3832
  msg = self.bundle.get("date_search_without_time_series")
3822
- print(msg)
3823
- self.logger.warning(msg)
3824
- self.warning_counter.increment()
3833
+ self.__log_warning(msg)
3825
3834
 
3826
3835
  if len(valid_search_keys) == 1:
3827
3836
  key, value = list(valid_search_keys.items())[0]
3828
3837
  # Show warning for country only if country is the only key
3829
3838
  if x[key].nunique() == 1:
3830
3839
  msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3831
- print(msg)
3832
- self.logger.warning(msg)
3833
- self.warning_counter.increment()
3840
+ if not silent_mode:
3841
+ self.__log_warning(msg)
3834
3842
  # TODO maybe raise ValidationError
3835
3843
 
3836
3844
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
@@ -3890,9 +3898,7 @@ class FeaturesEnricher(TransformerMixin):
3890
3898
  )
3891
3899
  else:
3892
3900
  msg = self.bundle.get("features_info_zero_important_features")
3893
- self.logger.warning(msg)
3894
- self.__display_support_link(msg)
3895
- self.warning_counter.increment()
3901
+ self.__log_warning(msg, show_support_link=True)
3896
3902
  except (ImportError, NameError):
3897
3903
  print(msg)
3898
3904
  print(self._internal_features_info)
@@ -3994,8 +4000,7 @@ class FeaturesEnricher(TransformerMixin):
3994
4000
  " But not used because not registered user"
3995
4001
  )
3996
4002
  if not silent_mode:
3997
- print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3998
- self.warning_counter.increment()
4003
+ self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3999
4004
 
4000
4005
  # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
4001
4006
  if check_need_detect(SearchKey.PHONE):
@@ -4014,8 +4019,7 @@ class FeaturesEnricher(TransformerMixin):
4014
4019
  "But not used because not registered user"
4015
4020
  )
4016
4021
  if not silent_mode:
4017
- print(self.bundle.get("phone_detected_not_registered"))
4018
- self.warning_counter.increment()
4022
+ self.__log_warning(self.bundle.get("phone_detected_not_registered"))
4019
4023
 
4020
4024
  return search_keys
4021
4025
 
@@ -4039,19 +4043,13 @@ class FeaturesEnricher(TransformerMixin):
4039
4043
  part2 = train[half_train:]
4040
4044
  train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
4041
4045
  if train_psi > 0.2:
4042
- self.warning_counter.increment()
4043
- msg = self.bundle.get("train_unstable_target").format(train_psi)
4044
- print(msg)
4045
- self.logger.warning(msg)
4046
+ self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
4046
4047
 
4047
4048
  # 2. Check train-test PSI
4048
4049
  if eval1 is not None:
4049
4050
  train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
4050
4051
  if train_test_psi > 0.2:
4051
- self.warning_counter.increment()
4052
- msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
4053
- print(msg)
4054
- self.logger.warning(msg)
4052
+ self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
4055
4053
 
4056
4054
  def _dump_python_libs(self):
4057
4055
  try:
@@ -4073,8 +4071,8 @@ class FeaturesEnricher(TransformerMixin):
4073
4071
  self.logger.warning(f"Showing support link: {link_text}")
4074
4072
  display(
4075
4073
  HTML(
4076
- f"""<br/>{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
4077
- here</a>"""
4074
+ f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
4075
+ here</a><br/>"""
4078
4076
  )
4079
4077
  )
4080
4078
  except (ImportError, NameError):
upgini/metrics.py CHANGED
@@ -745,20 +745,25 @@ class OtherEstimatorWrapper(EstimatorWrapper):
745
745
 
746
746
 
747
747
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
748
- if isinstance(scoring, str) and scoring is not None:
748
+ if scoring is None:
749
+ return
750
+
751
+ if isinstance(scoring, str):
749
752
  _get_scorer_by_name(scoring)
750
- elif isinstance(scoring, Callable):
751
- spec = inspect.getfullargspec(scoring)
752
- if len(spec.args) < 3:
753
- raise ValidationError(
754
- f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
755
- )
756
- elif scoring is not None:
753
+ return
754
+
755
+ if not isinstance(scoring, Callable):
757
756
  raise ValidationError(
758
757
  f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
759
758
  " that accepts 3 input arguments: estimator, x, y"
760
759
  )
761
760
 
761
+ spec = inspect.getfullargspec(scoring)
762
+ if len(spec.args) < 3:
763
+ raise ValidationError(
764
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
765
+ )
766
+
762
767
 
763
768
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
764
769
  metric_name = scoring
@@ -26,7 +26,6 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
26
26
  from upgini.utils import find_numbers_with_decimal_comma
27
27
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
28
28
  from upgini.utils.phone_utils import PhoneSearchKeyConverter
29
- from upgini.utils.warning_counter import WarningCounter
30
29
 
31
30
 
32
31
  class Normalizer:
@@ -37,16 +36,13 @@ class Normalizer:
37
36
  self,
38
37
  bundle: ResourceBundle = None,
39
38
  logger: Logger = None,
40
- warnings_counter: WarningCounter = None,
41
- silent_mode=False,
42
39
  ):
43
40
  self.bundle = bundle or get_custom_bundle()
44
41
  self.logger = logger or getLogger()
45
- self.warnings_counter = warnings_counter or WarningCounter()
46
- self.silent_mode = silent_mode
47
42
  self.columns_renaming = {}
48
43
  self.search_keys = {}
49
44
  self.generated_features = []
45
+ self.removed_features = []
50
46
 
51
47
  def normalize(
52
48
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
@@ -139,19 +135,11 @@ class Normalizer:
139
135
  def _remove_dates_from_features(self, df: pd.DataFrame):
140
136
  features = self._get_features(df)
141
137
 
142
- removed_features = []
143
138
  for f in features:
144
139
  if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
145
- removed_features.append(f)
140
+ self.removed_features.append(f)
146
141
  df.drop(columns=f, inplace=True)
147
142
 
148
- if removed_features:
149
- msg = self.bundle.get("dataset_date_features").format(removed_features)
150
- self.logger.warning(msg)
151
- if not self.silent_mode:
152
- print(msg)
153
- self.warnings_counter.increment()
154
-
155
143
  return df
156
144
 
157
145
  def _cut_too_long_string_values(self, df: pd.DataFrame):
@@ -15,31 +15,28 @@ transform_usage_warning=You are trying to launch enrichment for {} rows, which w
15
15
 
16
16
  # Warnings
17
17
  support_link=https://upgini.com/support
18
- # slack_community_link=https://4mlg.short.gy/join-upgini-community
19
- # slack_community_text=\nWARNING: Looks like you've run into an error. For help request write us in the Upgini community
20
- support_text=\nWARNING: Looks like you've run into an error. For help request write us in support
18
+ support_text=Looks like you've run into an error. For help request write us in support
21
19
  slack_community_bage=https://img.shields.io/badge/slack-@upgini-orange.svg?logo=slack
22
20
  slack_community_alt=Upgini Slack community
23
- version_warning=\nWARNING: Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
24
- unregistered_with_personal_keys=\nWARNING: Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
25
- date_only_search=\nWARNING: Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
26
- date_search_without_time_series=\nWARNING: Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
27
- metrics_exclude_paid_features=\nWARNING: Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
28
- metrics_no_important_free_features=\nWARNING: No important free features to calculate metrics
29
- metrics_no_important_features=\nWARNING: No important features to calculate metrics
21
+ version_warning=Unsupported library version detected {},\nplease update with “%pip install -U upgini” to the latest {} and restart Jupyter kernel
22
+ unregistered_with_personal_keys=Search key {} can be used only with personal api_key from profile.upgini.com It will be ignored
23
+ date_only_search=Search started with DATE search key only\nTry to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IP to your training dataset\nfor search through all the available data sources.\nSee docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
24
+ date_search_without_time_series=Looks like your training dataset is a time series. We recommend to set `cv=CVType.time_series` param for correct search results.\nSee docs https://github.com/upgini/upgini#-time-series-prediction-support
25
+ metrics_exclude_paid_features=Metrics calculated after enrichment has a free features only. To calculate metrics with a full set of relevant features, including commercial data sources, please contact support team:
26
+ metrics_no_important_free_features=No important free features to calculate metrics
27
+ metrics_no_important_features=No important features to calculate metrics
30
28
  metrics_negative_uplift_without_cv=Please re-check that your task is not a time series prediction. If so, restart search with cv=CVType.time_series param for correct search results. See docs https://github.com/upgini/upgini#-time-series-prediction-support
31
29
  # metrics_with_trial_features=The calculation of final accuracy metrics using Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
32
- # transform_with_trial_features=\nWARNING: Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
30
+ # transform_with_trial_features=Your search results contain Trial data sources. To enrich your dataframe using transform or fit_transform with features from these Trial data sources, please register for a Free API key at https://upgini.com and resubmit your request.
33
31
  # Enriching with Trial data is not available for unauthorized users.\nGet a free API key on https://upgini.com and repeat your request.
34
- metrics_with_paid_features=\nWARNING: The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
35
- transform_with_paid_features=\nWARNING: Enriching with Paid data is not available.\nContact Upgini support for the data access
36
- trial_quota_limit_riched=\nWARNING: You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
37
- loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection with {1}
38
- loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
- multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
- group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
- current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
42
-
32
+ metrics_with_paid_features=The calculation of final accuracy metrics using Paid data is not available.\nContact Upgini support for the data access
33
+ transform_with_paid_features=Enriching with Paid data is not available.\nContact Upgini support for the data access
34
+ trial_quota_limit_riched=You have reached the quota limit of trial data usage. Please contact Upgini support to remove restriction
35
+ loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
36
+ loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
37
+ multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
38
+ group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
39
+ current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
43
40
  # Errors
44
41
  failed_search_by_task_id=Failed to retrieve the specified search results
45
42
  metrics_unfitted_enricher=Call fit method before calling calculate_metrics
@@ -86,11 +83,11 @@ search_key_not_found=Column `{}` from search_keys was not found in X dataframe:
86
83
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
87
84
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
85
  unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearcKey
89
- search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
86
+ search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
90
87
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
91
- single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
88
+ single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
92
89
  unsupported_multi_key=Search key {} cannot be used multiple times
93
- unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
90
+ unsupported_index_column=Your column with name `index` was dropped because it's reserved name is booked for system needs.
94
91
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
95
92
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
96
93
  unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
@@ -100,7 +97,7 @@ invalid_ip=All values of IP column `{}` are invalid
100
97
  # X and y validation
101
98
  unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
102
99
  x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates
103
- x_contains_enriching_columns=\nWARNING: X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
100
+ x_contains_enriching_columns=X contains column names that match the names of features from external data sources. They will be dropped from the dataframe before the enrichment: {}
104
101
  unsupported_y_type=Unsupported type of y: {}. Use pandas.DataFrame, pandas.Series, numpy.ndarray or list
105
102
  y_is_constant=y is a constant. Relevant feature search requires a non-constant y
106
103
  x_and_y_diff_size=X and y has different size: {}, {}.
@@ -113,10 +110,10 @@ y_multiindex_unsupported=Multi index in y is not supported
113
110
  x_is_empty=X is empty
114
111
  y_is_empty=y is empty
115
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
116
- missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
117
- x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
118
- train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
119
- eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
113
+ missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
+ x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
+ train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
+ eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
120
117
  # eval set validation
121
118
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
122
119
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -140,21 +137,23 @@ missing_features_for_transform=Missing some features for transform that were pre
140
137
  # target validation
141
138
  empty_target=Target is empty in all rows
142
139
  # non_numeric_target=Binary target should be numerical type
143
- uneven_eval_target_distribution=\nWARNING: y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
144
- target_outliers_warning=\nWARNING: We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
140
+ uneven_eval_target_distribution=y distributions from the training sample and eval_set differ according to the Kolmogorov-Smirnov test,\nwhich makes metrics between the train and eval_set incomparable.
141
+ target_outliers_warning=We detected {} outliers in your sample.\nExamples of outliers with maximum value of target:\n{}\nOutliers will {}be excluded during the metrics calculation.
142
+
145
143
  # features validation
146
- empty_or_contant_features=\nWARNING: Columns {} has value with frequency more than 99%, removed from X
147
- high_cardinality_features=\nWARNING: Columns {} has high cardinality (>90% unique values), removed from X
148
- # one_hot_encoded_features=\nWARNING: One hot encoded features detected. Use int encoding for correct results of fit.\n{}
144
+ empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
145
+ high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
146
+ # one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
147
+
149
148
  # Dataset validation
150
149
  dataset_too_few_rows=X size should be at least {} rows after validation
151
150
  dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
152
151
  dataset_empty_column_names=Some column names are empty. Add names please
153
- dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
154
- dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
155
- dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
156
- dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
157
- dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
152
+ dataset_full_duplicates={:.5f}% of the rows are fully duplicated
153
+ dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
154
+ dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
155
+ dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
156
+ dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
158
157
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
159
158
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
160
159
  dataset_invalid_binary_target=Binary task type should contain only 2 target values, but {} found
@@ -163,8 +162,8 @@ dataset_invalid_regression_target=Unexpected dtype of target for regression task
163
162
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
164
163
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
165
164
  dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
166
- dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
167
- dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
165
+ dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
166
+ dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
168
167
  dataset_too_many_features=Too many features. Maximum number of features is {}
169
168
  dataset_constant_target=y contains only one distinct value
170
169
  dataset_empty_target=y contains only NaN or incorrect values.
@@ -172,10 +171,9 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
172
171
  dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
173
172
  dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
174
173
  dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
175
- binary_small_dataset=\nWARNING: The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.
174
+ binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.
176
175
  all_search_keys_invalid=All search keys are invalid
177
- all_emails_invalid=\nWARNING: All values in column {} are invalid emails
178
- # Metrics validation
176
+ all_emails_invalid=All values in column {} are invalid emails # Metrics validation
179
177
  metrics_msle_negative_target=Mean Squared Logarithmic Error cannot be used when y contain negative values
180
178
  metrics_unsupported_target_type=Unsupported type of target in y: {}
181
179
  metrics_invalid_scoring={} is not a valid scoring value. Use {} to get valid options
@@ -193,8 +191,7 @@ ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample does
193
191
  # Features info warning
194
192
  features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
195
193
  features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
196
- features_not_generated=\nWARNING: Following features didn't pass checks for automated feature generation: {}
197
-
194
+ features_not_generated=Following features didn't pass checks for automated feature generation: {}
198
195
  # Information
199
196
  postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
200
197
  country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
@@ -203,8 +200,8 @@ country_default_determined=Search key country_code `{}` was used as default. \nS
203
200
  email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
205
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
206
- phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
207
- target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
203
+ phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
+ target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
208
205
  binary_target_reason=only two unique label-values observed
209
206
  non_numeric_multiclass_reason=non-numeric label values observed
210
207
  few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
@@ -215,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
215
212
  all_ok_community_invite=❓ Support request
216
213
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
217
214
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
218
- imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
215
+ imbalanced_target=Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
219
216
  loss_selection_info=Using loss `{}` for feature selection
220
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
221
218
 
@@ -11,7 +11,6 @@ from pandas.api.types import is_numeric_dtype
11
11
  from upgini.errors import ValidationError
12
12
  from upgini.metadata import EVAL_SET_INDEX, SearchKey
13
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
- from upgini.utils.warning_counter import WarningCounter
15
14
 
16
15
  DATE_FORMATS = [
17
16
  "%Y-%m-%d",
@@ -42,8 +41,6 @@ class DateTimeSearchKeyConverter:
42
41
  date_format: Optional[str] = None,
43
42
  logger: Optional[logging.Logger] = None,
44
43
  bundle: Optional[ResourceBundle] = None,
45
- warnings_counter: Optional[WarningCounter] = None,
46
- silent_mode=False,
47
44
  ):
48
45
  self.date_column = date_column
49
46
  self.date_format = date_format
@@ -54,8 +51,7 @@ class DateTimeSearchKeyConverter:
54
51
  self.logger.setLevel("FATAL")
55
52
  self.generated_features: List[str] = []
56
53
  self.bundle = bundle or get_custom_bundle()
57
- self.warnings_counter = warnings_counter or WarningCounter()
58
- self.silent_mode = silent_mode
54
+ self.has_old_dates = False
59
55
 
60
56
  @staticmethod
61
57
  def _int_to_opt(i: int) -> Optional[int]:
@@ -101,7 +97,6 @@ class DateTimeSearchKeyConverter:
101
97
  df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
102
98
  else:
103
99
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
104
- self.logger.warning(msg)
105
100
  raise ValidationError(msg)
106
101
  else:
107
102
  df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
@@ -162,13 +157,9 @@ class DateTimeSearchKeyConverter:
162
157
  condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
163
158
  old_subset = df[condition]
164
159
  if len(old_subset) > 0:
160
+ self.has_old_dates = True
165
161
  df.loc[condition, self.date_column] = None
166
162
  self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
167
- msg = self.bundle.get("dataset_drop_old_dates")
168
- self.logger.warning(msg)
169
- if not self.silent_mode:
170
- print(msg)
171
- self.warnings_counter.increment()
172
163
  return df
173
164
 
174
165
 
@@ -256,13 +247,10 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
256
247
  return len(accumulated_changing_columns) <= 2
257
248
 
258
249
 
259
- def validate_dates_distribution(
250
+ def is_dates_distribution_valid(
260
251
  df: pd.DataFrame,
261
252
  search_keys: Dict[str, SearchKey],
262
- logger: Optional[logging.Logger] = None,
263
- bundle: Optional[ResourceBundle] = None,
264
- warning_counter: Optional[WarningCounter] = None,
265
- ):
253
+ ) -> bool:
266
254
  maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
267
255
 
268
256
  if EVAL_SET_INDEX in df.columns:
@@ -303,13 +291,4 @@ def validate_dates_distribution(
303
291
  date_counts_2 = date_counts[round(len(date_counts) / 2) :]
304
292
  ratio = date_counts_2.mean() / date_counts_1.mean()
305
293
 
306
- if ratio > 1.2 or ratio < 0.8:
307
- if warning_counter is not None:
308
- warning_counter.increment()
309
- if logger is None:
310
- logger = logging.getLogger("muted_logger")
311
- logger.setLevel("FATAL")
312
- bundle = bundle or get_custom_bundle()
313
- msg = bundle.get("x_unstable_by_date")
314
- print(msg)
315
- logger.warning(msg)
294
+ return ratio >= 0.8 and ratio <= 1.2
@@ -1,5 +1,6 @@
1
+ import logging
1
2
  from logging import Logger
2
- from typing import Dict, List, Optional, Union
3
+ from typing import Dict, List, Optional, Tuple, Union
3
4
 
4
5
  import pandas as pd
5
6
 
@@ -12,7 +13,7 @@ from upgini.metadata import (
12
13
  ModelTaskType,
13
14
  SearchKey,
14
15
  )
15
- from upgini.resource_bundle import ResourceBundle
16
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
16
17
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
18
  from upgini.utils.target_utils import define_task
18
19
 
@@ -22,16 +23,19 @@ def remove_fintech_duplicates(
22
23
  search_keys: Dict[str, SearchKey],
23
24
  date_format: Optional[str] = None,
24
25
  logger: Optional[Logger] = None,
25
- silent=False,
26
26
  bundle: ResourceBundle = None,
27
- ) -> pd.DataFrame:
27
+ ) -> tuple[pd.DataFrame, Optional[List[str]]]:
28
28
  # Initial checks for target type and date column
29
+ bundle = bundle or get_custom_bundle()
30
+ if logger is None:
31
+ logger = logging.getLogger()
32
+ logger.setLevel(logging.FATAL)
29
33
  date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
30
34
  if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
31
- return df
35
+ return df, []
32
36
 
33
37
  if date_col is None:
34
- return df
38
+ return df, []
35
39
 
36
40
  personal_cols = []
37
41
  phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
@@ -44,7 +48,7 @@ def remove_fintech_duplicates(
44
48
  if hem_col:
45
49
  personal_cols.append(hem_col)
46
50
  if len(personal_cols) == 0:
47
- return df
51
+ return df, []
48
52
 
49
53
  # Splitting into train and eval_set parts
50
54
  if EVAL_SET_INDEX in df.columns:
@@ -54,11 +58,13 @@ def remove_fintech_duplicates(
54
58
  train_df = df
55
59
  eval_dfs = []
56
60
 
57
- def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
61
+ warning_messages = []
62
+
63
+ def process_df(segment_df: pd.DataFrame, eval_index=0) -> tuple[pd.DataFrame, Optional[str]]:
58
64
  """Process a subset of the dataset to remove duplicates based on personal keys."""
59
65
  # Fast check for duplicates based on personal keys
60
66
  if not segment_df[personal_cols].duplicated().any():
61
- return segment_df
67
+ return segment_df, None
62
68
 
63
69
  sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
64
70
 
@@ -70,18 +76,18 @@ def remove_fintech_duplicates(
70
76
  total = len(uniques)
71
77
  diff_dates = len(uniques[uniques > 1])
72
78
  if diff_dates / total >= 0.6:
73
- return segment_df
79
+ return segment_df, None
74
80
 
75
81
  # Check for duplicate rows
76
82
  duplicates = sub_df.duplicated(personal_cols, keep=False)
77
83
  duplicate_rows = sub_df[duplicates]
78
84
  if len(duplicate_rows) == 0:
79
- return segment_df
85
+ return segment_df, None
80
86
 
81
87
  # Check if there are different target values for the same personal keys
82
88
  nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
83
89
  if nonunique_target_groups.sum() == 0:
84
- return segment_df
90
+ return segment_df, None
85
91
 
86
92
  # Helper function to check if there are different target values within 60 days
87
93
  def has_diff_target_within_60_days(rows: pd.DataFrame):
@@ -115,23 +121,23 @@ def remove_fintech_duplicates(
115
121
  msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
116
122
  perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
117
123
  )
118
- if not silent:
119
- print(msg)
120
- if logger:
121
- logger.warning(msg)
122
- return segment_df[~segment_df.index.isin(rows_to_remove.index)]
123
- return segment_df
124
+ return segment_df[~segment_df.index.isin(rows_to_remove.index)], msg
125
+ return segment_df, None
124
126
 
125
127
  # Process the train part separately
126
128
  logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
127
- train_df = process_df(train_df)
129
+ train_df, train_warning = process_df(train_df)
130
+ if train_warning:
131
+ warning_messages.append(train_warning)
128
132
  logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
129
133
 
130
134
  # Process each eval_set part separately
131
135
  new_eval_dfs = []
132
136
  for i, eval_df in enumerate(eval_dfs, 1):
133
137
  logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
134
- cleaned_eval_df = process_df(eval_df, i)
138
+ cleaned_eval_df, eval_warning = process_df(eval_df, i)
139
+ if eval_warning:
140
+ warning_messages.append(eval_warning)
135
141
  logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
136
142
  new_eval_dfs.append(cleaned_eval_df)
137
143
 
@@ -143,15 +149,21 @@ def remove_fintech_duplicates(
143
149
  df = train_df
144
150
  logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
145
151
 
146
- return df
152
+ return df, warning_messages
147
153
 
148
154
 
149
155
  def clean_full_duplicates(
150
- df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
151
- ) -> pd.DataFrame:
156
+ df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
157
+ ) -> Tuple[pd.DataFrame, Optional[str]]:
158
+ if logger is None:
159
+ logger = logging.getLogger()
160
+ logger.setLevel(logging.FATAL)
161
+ if bundle is None:
162
+ bundle = get_custom_bundle()
163
+
152
164
  nrows = len(df)
153
165
  if nrows == 0:
154
- return df
166
+ return df, None
155
167
  # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
156
168
  unique_columns = df.columns.tolist()
157
169
  if SYSTEM_RECORD_ID in unique_columns:
@@ -162,6 +174,7 @@ def clean_full_duplicates(
162
174
  unique_columns.remove(SORT_ID)
163
175
  if EVAL_SET_INDEX in unique_columns:
164
176
  unique_columns.remove(EVAL_SET_INDEX)
177
+
165
178
  logger.info(f"Dataset shape before clean duplicates: {df.shape}")
166
179
  # Train segment goes first so if duplicates are found in train and eval set
167
180
  # then we keep unique rows in train segment
@@ -170,11 +183,9 @@ def clean_full_duplicates(
170
183
  nrows_after_full_dedup = len(df)
171
184
  share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
172
185
  if share_full_dedup > 0:
173
- msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
174
- logger.warning(msg)
175
- # if not silent_mode:
176
- # print(msg)
177
- # self.warning_counter.increment()
186
+ logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
187
+
188
+ msg = None
178
189
  if TARGET in df.columns:
179
190
  unique_columns.remove(TARGET)
180
191
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
@@ -185,13 +196,10 @@ def clean_full_duplicates(
185
196
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
186
197
 
187
198
  msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
188
- logger.warning(msg)
189
- if not silent:
190
- print(msg)
191
199
  df = df.drop_duplicates(subset=unique_columns, keep=False)
192
200
  logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
193
201
 
194
- return df
202
+ return df, msg
195
203
 
196
204
 
197
205
  def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
@@ -1,12 +1,11 @@
1
1
  import logging
2
2
  from logging import Logger
3
- from typing import Dict, List, Optional
3
+ from typing import Dict, List, Optional, Tuple
4
4
 
5
5
  import pandas as pd
6
6
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
7
 
8
8
  from upgini.resource_bundle import bundle
9
- from upgini.utils.warning_counter import WarningCounter
10
9
 
11
10
 
12
11
  class FeaturesValidator:
@@ -21,13 +20,13 @@ class FeaturesValidator:
21
20
  self,
22
21
  df: pd.DataFrame,
23
22
  features: List[str],
24
- features_for_generate: Optional[List[str]],
25
- warning_counter: WarningCounter,
23
+ features_for_generate: Optional[List[str]] = None,
26
24
  columns_renaming: Optional[Dict[str, str]] = None,
27
- ) -> List[str]:
25
+ ) -> Tuple[List[str], List[str]]:
28
26
  # one_hot_encoded_features = []
29
27
  empty_or_constant_features = []
30
28
  high_cardinality_features = []
29
+ warnings = []
31
30
 
32
31
  for f in features:
33
32
  column = df[f]
@@ -52,9 +51,7 @@ class FeaturesValidator:
52
51
 
53
52
  # if one_hot_encoded_features:
54
53
  # msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
55
- # print(msg)
56
- # self.logger.warning(msg)
57
- # warning_counter.increment()
54
+ # warnings.append(msg)
58
55
 
59
56
  columns_renaming = columns_renaming or {}
60
57
 
@@ -62,9 +59,7 @@ class FeaturesValidator:
62
59
  msg = bundle.get("empty_or_contant_features").format(
63
60
  [columns_renaming.get(f, f) for f in empty_or_constant_features]
64
61
  )
65
- print(msg)
66
- self.logger.warning(msg)
67
- warning_counter.increment()
62
+ warnings.append(msg)
68
63
 
69
64
  high_cardinality_features = self.find_high_cardinality(df[features])
70
65
  if features_for_generate:
@@ -75,11 +70,9 @@ class FeaturesValidator:
75
70
  msg = bundle.get("high_cardinality_features").format(
76
71
  [columns_renaming.get(f, f) for f in high_cardinality_features]
77
72
  )
78
- print(msg)
79
- self.logger.warning(msg)
80
- warning_counter.increment()
73
+ warnings.append(msg)
81
74
 
82
- return empty_or_constant_features + high_cardinality_features
75
+ return (empty_or_constant_features + high_cardinality_features, warnings)
83
76
 
84
77
  @staticmethod
85
78
  def find_high_cardinality(df: pd.DataFrame) -> List[str]:
@@ -4,6 +4,7 @@ class WarningCounter:
4
4
 
5
5
  def increment(self):
6
6
  self._count += 1
7
+ return self._count
7
8
 
8
9
  def reset(self):
9
10
  self._count = 0
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import threading
3
+ from typing import Callable, Optional
3
4
 
4
5
  import requests
5
6
 
@@ -30,15 +31,18 @@ def get_version(package, url_pattern=URL_PATTERN):
30
31
  return version
31
32
 
32
33
 
33
- def validate_version(logger: logging.Logger):
34
+ def validate_version(logger: logging.Logger, warning_function: Optional[Callable[[str], None]] = None):
34
35
  def task():
35
36
  try:
36
37
  current_version = parse(__version__)
37
38
  latest_version = get_version("upgini")
38
39
  if current_version < latest_version:
39
40
  msg = bundle.get("version_warning").format(current_version, latest_version)
40
- logger.warning(msg)
41
- print(msg)
41
+ if warning_function:
42
+ warning_function(msg)
43
+ else:
44
+ logger.warning(msg)
45
+ print(msg)
42
46
  except Exception:
43
47
  logger.warning("Failed to validate version", exc_info=True)
44
48
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.22
3
+ Version: 1.2.23
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,16 +1,16 @@
1
- upgini/__about__.py,sha256=P6UdnfqZMN8bM1yBQGaUu5LMabVISCCurCBNtZJOvTE,23
1
+ upgini/__about__.py,sha256=y_Ev8AcJxzZe4ZJWlW3Wsver97OJUqU1nFSDqHzKBDw,23
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=CK_ymyXeS0JxzBxy2y2UJ7miwy0DUcwdJdJBoFNY0IE,193511
6
+ upgini/features_enricher.py,sha256=rctS3kRWwTJmU5X203t7sUZ_B40XYVBPeXy_0hPw2Ec,193667
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
- upgini/metrics.py,sha256=M508zOvqg0uc2sSgS8fpU7uNjGv1JA6iW_gDDOq-6PI,34474
10
+ upgini/metrics.py,sha256=10Cg_6cqIOsZyz2tO4GGbCxiBH7lGb35Vh-pR6IUzLg,34459
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
- upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
13
+ upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
30
+ upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=bWWznzu43Lwfd-j4XDrpKJCpoxMMThd73awB7ge7wfo,27319
33
+ upgini/resource_bundle/strings.properties,sha256=wxdxH13ncXjRion__GCM_ecllCDjGqOhOxZ41beFslg,26665
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -42,12 +42,12 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
42
42
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
- upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
46
- upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
45
+ upgini/utils/datetime_utils.py,sha256=a8X4jX2y3-6E7ZNZIG5z61qfzCvsvaNEjR1Bi5KUqfM,11279
46
+ upgini/utils/deduplicate_utils.py,sha256=kINO1KoH8kPRA3JSYogzv4jaUP1Ceguv5etBPtLcsSw,8855
47
47
  upgini/utils/display_utils.py,sha256=NGhki1aGMsS8OeI69eLXEpmS_s41k8ojKHQxacJaXiU,11493
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
- upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
50
+ upgini/utils/features_validator.py,sha256=1Xj2ir5LzzYiX3NH8o88c2J6RTTetaTwu0MhjLTyuvM,3378
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
53
53
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
@@ -56,8 +56,8 @@ upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
57
  upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
- upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.22.dist-info/METADATA,sha256=xz213bCp7FlucAgHEqT8KlX7G0E_BMwP3wN444cz3QU,48578
61
- upgini-1.2.22.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.22.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.22.dist-info/RECORD,,
59
+ upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
60
+ upgini-1.2.23.dist-info/METADATA,sha256=DGV0FR8F9_7casA5R9U3b22oSUhXuZeX0RfNGnMgnQ8,48578
61
+ upgini-1.2.23.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.23.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.23.dist-info/RECORD,,