upgini 1.2.21__tar.gz → 1.2.23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.2.21 → upgini-1.2.23}/PKG-INFO +1 -1
  2. upgini-1.2.23/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/features_enricher.py +81 -79
  4. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/metrics.py +13 -8
  5. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/normalizer/normalize_utils.py +2 -14
  6. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/strings.properties +45 -48
  7. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/datetime_utils.py +5 -26
  8. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/deduplicate_utils.py +41 -33
  9. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/features_validator.py +8 -15
  10. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/warning_counter.py +1 -0
  11. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/version_validator.py +7 -3
  12. upgini-1.2.21/src/upgini/__about__.py +0 -1
  13. {upgini-1.2.21 → upgini-1.2.23}/.gitignore +0 -0
  14. {upgini-1.2.21 → upgini-1.2.23}/LICENSE +0 -0
  15. {upgini-1.2.21 → upgini-1.2.23}/README.md +0 -0
  16. {upgini-1.2.21 → upgini-1.2.23}/pyproject.toml +0 -0
  17. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/__init__.py +0 -0
  18. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/ads.py +0 -0
  19. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/ads_management/__init__.py +0 -0
  20. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/ads_management/ads_manager.py +0 -0
  21. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/__init__.py +0 -0
  22. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/all_operands.py +0 -0
  23. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/binary.py +0 -0
  24. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/date.py +0 -0
  25. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/feature.py +0 -0
  26. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/groupby.py +0 -0
  27. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/operand.py +0 -0
  28. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/unary.py +0 -0
  29. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/autofe/vector.py +0 -0
  30. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/data_source/__init__.py +0 -0
  31. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/data_source/data_source_publisher.py +0 -0
  32. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/dataset.py +0 -0
  33. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/errors.py +0 -0
  34. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/http.py +0 -0
  35. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/lazy_import.py +0 -0
  36. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/mdc/__init__.py +0 -0
  37. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/mdc/context.py +0 -0
  38. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/metadata.py +0 -0
  39. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/normalizer/__init__.py +0 -0
  40. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/__init__.py +0 -0
  41. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/exceptions.py +0 -0
  42. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/search_task.py +0 -0
  48. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/spinner.py +0 -0
  49. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/__init__.py +0 -0
  50. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/base_search_key_detector.py +0 -0
  51. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/blocked_time_series.py +0 -0
  52. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/country_utils.py +0 -0
  53. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/custom_loss_utils.py +0 -0
  54. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/cv_utils.py +0 -0
  55. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.21 → upgini-1.2.23}/src/upgini/utils/track_info.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.21
3
+ Version: 1.2.23
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.23"
@@ -77,8 +77,8 @@ from upgini.utils.cv_utils import CVConfig, get_groups
77
77
  from upgini.utils.datetime_utils import (
78
78
  DateTimeSearchKeyConverter,
79
79
  is_blocked_time_series,
80
+ is_dates_distribution_valid,
80
81
  is_time_series,
81
- validate_dates_distribution,
82
82
  )
83
83
  from upgini.utils.deduplicate_utils import (
84
84
  clean_full_duplicates,
@@ -263,7 +263,7 @@ class FeaturesEnricher(TransformerMixin):
263
263
  dict()
264
264
  )
265
265
 
266
- validate_version(self.logger)
266
+ validate_version(self.logger, self.__log_warning)
267
267
  self.search_keys = search_keys or {}
268
268
  self.country_code = country_code
269
269
  self.__validate_search_keys(search_keys, search_id)
@@ -723,7 +723,7 @@ class FeaturesEnricher(TransformerMixin):
723
723
 
724
724
  start_time = time.time()
725
725
  try:
726
- result, _ = self.__inner_transform(
726
+ result, _, _ = self.__inner_transform(
727
727
  trace_id,
728
728
  X,
729
729
  exclude_features_sources=exclude_features_sources,
@@ -951,9 +951,7 @@ class FeaturesEnricher(TransformerMixin):
951
951
  gc.collect()
952
952
 
953
953
  if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
954
- print(self.bundle.get("metrics_no_important_free_features"))
955
- self.logger.warning("No client or free relevant ADS features found to calculate metrics")
956
- self.warning_counter.increment()
954
+ self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
957
955
  return None
958
956
 
959
957
  print(self.bundle.get("metrics_start"))
@@ -1654,9 +1652,7 @@ class FeaturesEnricher(TransformerMixin):
1654
1652
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1655
1653
  generated_features = []
1656
1654
  if date_column is not None:
1657
- converter = DateTimeSearchKeyConverter(
1658
- date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1659
- )
1655
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1660
1656
  df = converter.convert(df, keep_time=True)
1661
1657
  generated_features = converter.generated_features
1662
1658
 
@@ -1666,11 +1662,11 @@ class FeaturesEnricher(TransformerMixin):
1666
1662
  df = generator.generate(df)
1667
1663
  generated_features.extend(generator.generated_features)
1668
1664
 
1669
- normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1665
+ normalizer = Normalizer(self.bundle, self.logger)
1670
1666
  df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1671
1667
  columns_renaming = normalizer.columns_renaming
1672
1668
 
1673
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1669
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1674
1670
 
1675
1671
  num_samples = _num_samples(df)
1676
1672
  sample_threshold, sample_rows = (
@@ -1817,7 +1813,7 @@ class FeaturesEnricher(TransformerMixin):
1817
1813
  eval_df_with_index[EVAL_SET_INDEX] = idx + 1
1818
1814
  df = pd.concat([df, eval_df_with_index])
1819
1815
 
1820
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1816
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1821
1817
 
1822
1818
  # downsample if need to eval_set threshold
1823
1819
  num_samples = _num_samples(df)
@@ -1830,7 +1826,7 @@ class FeaturesEnricher(TransformerMixin):
1830
1826
  tmp_target_name = "__target"
1831
1827
  df = df.rename(columns={TARGET: tmp_target_name})
1832
1828
 
1833
- enriched_df, columns_renaming = self.__inner_transform(
1829
+ enriched_df, columns_renaming, generated_features = self.__inner_transform(
1834
1830
  trace_id,
1835
1831
  df,
1836
1832
  exclude_features_sources=exclude_features_sources,
@@ -1847,7 +1843,7 @@ class FeaturesEnricher(TransformerMixin):
1847
1843
 
1848
1844
  x_columns = [
1849
1845
  c
1850
- for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
1846
+ for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
1851
1847
  if c in enriched_df.columns
1852
1848
  ]
1853
1849
 
@@ -1869,7 +1865,7 @@ class FeaturesEnricher(TransformerMixin):
1869
1865
 
1870
1866
  df[TARGET] = validated_y
1871
1867
 
1872
- df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1868
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1873
1869
 
1874
1870
  num_samples = _num_samples(df)
1875
1871
  if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
@@ -1879,7 +1875,7 @@ class FeaturesEnricher(TransformerMixin):
1879
1875
  tmp_target_name = "__target"
1880
1876
  df = df.rename(columns={TARGET: tmp_target_name})
1881
1877
 
1882
- enriched_Xy, columns_renaming = self.__inner_transform(
1878
+ enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
1883
1879
  trace_id,
1884
1880
  df,
1885
1881
  exclude_features_sources=exclude_features_sources,
@@ -1896,7 +1892,7 @@ class FeaturesEnricher(TransformerMixin):
1896
1892
 
1897
1893
  x_columns = [
1898
1894
  c
1899
- for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
1895
+ for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
1900
1896
  if c in enriched_Xy.columns
1901
1897
  ]
1902
1898
 
@@ -1904,7 +1900,7 @@ class FeaturesEnricher(TransformerMixin):
1904
1900
  y_sampled = enriched_Xy[TARGET].copy()
1905
1901
  enriched_X = enriched_Xy.drop(columns=TARGET)
1906
1902
 
1907
- datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
1903
+ datasets_hash = hash_input(validated_X, validated_y, eval_set)
1908
1904
  self.__cached_sampled_datasets[datasets_hash] = (
1909
1905
  X_sampled,
1910
1906
  y_sampled,
@@ -2023,7 +2019,7 @@ class FeaturesEnricher(TransformerMixin):
2023
2019
  progress_bar: Optional[ProgressBar] = None,
2024
2020
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2025
2021
  add_fit_system_record_id: bool = False,
2026
- ) -> Tuple[pd.DataFrame, Dict[str, str]]:
2022
+ ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
2027
2023
  if self._search_task is None:
2028
2024
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
2029
2025
 
@@ -2036,24 +2032,25 @@ class FeaturesEnricher(TransformerMixin):
2036
2032
 
2037
2033
  if len(self.feature_names_) == 0:
2038
2034
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
2039
- return X, {c: c for c in X.columns}
2035
+ return X, {c: c for c in X.columns}, []
2040
2036
 
2041
2037
  if self._has_paid_features(exclude_features_sources):
2042
2038
  msg = self.bundle.get("transform_with_paid_features")
2043
2039
  self.logger.warning(msg)
2044
2040
  self.__display_support_link(msg)
2045
- return None, {c: c for c in X.columns}
2041
+ return None, {c: c for c in X.columns}, []
2046
2042
 
2047
2043
  if not metrics_calculation:
2048
2044
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
2049
2045
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
2050
2046
  if transform_usage.has_limit:
2051
2047
  if len(X) > transform_usage.rest_rows:
2052
- msg = self.bundle.get("transform_usage_warning").format(len(X), transform_usage.rest_rows)
2048
+ rest_rows = max(transform_usage.rest_rows, 0)
2049
+ msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
2053
2050
  self.logger.warning(msg)
2054
2051
  print(msg)
2055
2052
  show_request_quote_button()
2056
- return None, {c: c for c in X.columns}
2053
+ return None, {c: c for c in X.columns}, []
2057
2054
  else:
2058
2055
  msg = self.bundle.get("transform_usage_info").format(
2059
2056
  transform_usage.limit, transform_usage.transformed_rows
@@ -2093,9 +2090,7 @@ class FeaturesEnricher(TransformerMixin):
2093
2090
  generated_features = []
2094
2091
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2095
2092
  if date_column is not None:
2096
- converter = DateTimeSearchKeyConverter(
2097
- date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2098
- )
2093
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2099
2094
  df = converter.convert(df)
2100
2095
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2101
2096
  generated_features.extend(converter.generated_features)
@@ -2110,7 +2105,7 @@ class FeaturesEnricher(TransformerMixin):
2110
2105
  df = generator.generate(df)
2111
2106
  generated_features.extend(generator.generated_features)
2112
2107
 
2113
- normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2108
+ normalizer = Normalizer(self.bundle, self.logger)
2114
2109
  df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2115
2110
  columns_renaming = normalizer.columns_renaming
2116
2111
 
@@ -2176,7 +2171,7 @@ class FeaturesEnricher(TransformerMixin):
2176
2171
  converter = PostalCodeSearchKeyConverter(postal_code)
2177
2172
  df = converter.convert(df)
2178
2173
 
2179
- generated_features = [f for f in generated_features if f in self.fit_generated_features]
2174
+ # generated_features = [f for f in generated_features if f in self.fit_generated_features]
2180
2175
 
2181
2176
  meaning_types = {col: key.value for col, key in search_keys.items()}
2182
2177
  for col in features_for_transform:
@@ -2216,9 +2211,11 @@ class FeaturesEnricher(TransformerMixin):
2216
2211
 
2217
2212
  df_without_features = df.drop(columns=features_not_to_pass)
2218
2213
 
2219
- df_without_features = clean_full_duplicates(
2220
- df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
2214
+ df_without_features, full_duplicates_warning = clean_full_duplicates(
2215
+ df_without_features, self.logger, bundle=self.bundle
2221
2216
  )
2217
+ if not silent_mode and full_duplicates_warning:
2218
+ self.__log_warning(full_duplicates_warning)
2222
2219
 
2223
2220
  del df
2224
2221
  gc.collect()
@@ -2337,7 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
2337
2334
  if add_fit_system_record_id:
2338
2335
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2339
2336
 
2340
- return result, columns_renaming
2337
+ return result, columns_renaming, generated_features
2341
2338
 
2342
2339
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2343
2340
  features_info = self._internal_features_info
@@ -2415,6 +2412,15 @@ class FeaturesEnricher(TransformerMixin):
2415
2412
  def __is_registered(self) -> bool:
2416
2413
  return self.api_key is not None and self.api_key != ""
2417
2414
 
2415
+ def __log_warning(self, message: str, show_support_link: bool = False):
2416
+ warning_num = self.warning_counter.increment()
2417
+ formatted_message = f"WARNING #{warning_num}: {message}\n"
2418
+ if show_support_link:
2419
+ self.__display_support_link(formatted_message)
2420
+ else:
2421
+ print(formatted_message)
2422
+ self.logger.warning(message)
2423
+
2418
2424
  def __inner_fit(
2419
2425
  self,
2420
2426
  trace_id: str,
@@ -2461,9 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
2461
2467
  checked_generate_features = []
2462
2468
  for gen_feature in self.generate_features:
2463
2469
  if gen_feature not in x_columns:
2464
- msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2465
- print(msg)
2466
- self.logger.warning(msg)
2470
+ self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2467
2471
  else:
2468
2472
  checked_generate_features.append(gen_feature)
2469
2473
  self.generate_features = checked_generate_features
@@ -2524,9 +2528,10 @@ class FeaturesEnricher(TransformerMixin):
2524
2528
  self.date_format,
2525
2529
  self.logger,
2526
2530
  bundle=self.bundle,
2527
- warnings_counter=self.warning_counter,
2528
2531
  )
2529
2532
  df = converter.convert(df, keep_time=True)
2533
+ if converter.has_old_dates:
2534
+ self.__log_warning(self.bundle.get("dataset_drop_old_dates"))
2530
2535
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2531
2536
  self.fit_generated_features.extend(converter.generated_features)
2532
2537
  else:
@@ -2541,23 +2546,36 @@ class FeaturesEnricher(TransformerMixin):
2541
2546
  self.fit_generated_features.extend(generator.generated_features)
2542
2547
 
2543
2548
  # Checks that need validated date
2544
- validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2545
2549
 
2546
- if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2550
+ if not is_dates_distribution_valid(df, self.fit_search_keys):
2551
+ self.__log_warning(bundle.get("x_unstable_by_date"))
2552
+
2553
+ if (
2554
+ is_numeric_dtype(df[self.TARGET_NAME])
2555
+ and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
2556
+ and has_date
2557
+ ):
2547
2558
  self._validate_PSI(df.sort_values(by=maybe_date_column))
2548
2559
 
2549
- normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
2560
+ normalizer = Normalizer(self.bundle, self.logger)
2550
2561
  df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
2551
2562
  df, self.fit_search_keys, self.fit_generated_features
2552
2563
  )
2553
2564
  self.fit_columns_renaming = normalizer.columns_renaming
2565
+ if normalizer.removed_features:
2566
+ self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
2554
2567
 
2555
2568
  self.__adjust_cv(df)
2556
2569
 
2557
- df = remove_fintech_duplicates(
2570
+ df, fintech_warnings = remove_fintech_duplicates(
2558
2571
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
2559
2572
  )
2560
- df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2573
+ if fintech_warnings:
2574
+ for fintech_warning in fintech_warnings:
2575
+ self.__log_warning(fintech_warning)
2576
+ df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2577
+ if full_duplicates_warning:
2578
+ self.__log_warning(full_duplicates_warning)
2561
2579
 
2562
2580
  # Explode multiple search keys
2563
2581
  df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
@@ -2617,9 +2635,12 @@ class FeaturesEnricher(TransformerMixin):
2617
2635
 
2618
2636
  features_columns = [c for c in df.columns if c not in non_feature_columns]
2619
2637
 
2620
- features_to_drop = FeaturesValidator(self.logger).validate(
2621
- df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
2638
+ features_to_drop, feature_validator_warnings = FeaturesValidator(self.logger).validate(
2639
+ df, features_columns, self.generate_features, self.fit_columns_renaming
2622
2640
  )
2641
+ if feature_validator_warnings:
2642
+ for warning in feature_validator_warnings:
2643
+ self.__log_warning(warning)
2623
2644
  self.fit_dropped_features.update(features_to_drop)
2624
2645
  df = df.drop(columns=features_to_drop)
2625
2646
 
@@ -2735,9 +2756,7 @@ class FeaturesEnricher(TransformerMixin):
2735
2756
  zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
2736
2757
  if zero_hit_columns:
2737
2758
  msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
2738
- self.logger.warning(msg)
2739
- self.__display_support_link(msg)
2740
- self.warning_counter.increment()
2759
+ self.__log_warning(msg, show_support_link=True)
2741
2760
 
2742
2761
  if (
2743
2762
  self._search_task.unused_features_for_generation is not None
@@ -2747,9 +2766,7 @@ class FeaturesEnricher(TransformerMixin):
2747
2766
  dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
2748
2767
  ]
2749
2768
  msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
2750
- self.logger.warning(msg)
2751
- print(msg)
2752
- self.warning_counter.increment()
2769
+ self.__log_warning(msg)
2753
2770
 
2754
2771
  self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
2755
2772
 
@@ -3150,7 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
3150
3167
  maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3151
3168
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3152
3169
  # TODO cast date column to single dtype
3153
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
3170
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3154
3171
  converted_X = date_converter.convert(X)
3155
3172
  min_date = converted_X[maybe_date_col].min()
3156
3173
  max_date = converted_X[maybe_date_col].max()
@@ -3192,7 +3209,7 @@ class FeaturesEnricher(TransformerMixin):
3192
3209
  logger.warning(msg)
3193
3210
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3194
3211
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3195
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
3212
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
3196
3213
  df = converter.convert(df)
3197
3214
  return df
3198
3215
 
@@ -3764,15 +3781,15 @@ class FeaturesEnricher(TransformerMixin):
3764
3781
  if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
3765
3782
  msg = self.bundle.get("search_key_country_and_country_code")
3766
3783
  self.logger.warning(msg)
3767
- print(msg)
3784
+ if not silent_mode:
3785
+ self.__log_warning(msg)
3768
3786
  self.country_code = None
3769
3787
 
3770
3788
  if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
3771
3789
  msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
3772
3790
  self.logger.warning(msg)
3773
3791
  if not silent_mode:
3774
- self.warning_counter.increment()
3775
- print(msg)
3792
+ self.__log_warning(msg)
3776
3793
 
3777
3794
  valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
3778
3795
  else:
@@ -3806,27 +3823,22 @@ class FeaturesEnricher(TransformerMixin):
3806
3823
  and not silent_mode
3807
3824
  ):
3808
3825
  msg = self.bundle.get("date_only_search")
3809
- print(msg)
3810
- self.logger.warning(msg)
3811
- self.warning_counter.increment()
3826
+ self.__log_warning(msg)
3812
3827
 
3813
3828
  maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
3814
3829
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
3815
3830
  date_column = next(iter(maybe_date))
3816
3831
  if x[date_column].nunique() > 0.9 * _num_samples(x):
3817
3832
  msg = self.bundle.get("date_search_without_time_series")
3818
- print(msg)
3819
- self.logger.warning(msg)
3820
- self.warning_counter.increment()
3833
+ self.__log_warning(msg)
3821
3834
 
3822
3835
  if len(valid_search_keys) == 1:
3823
3836
  key, value = list(valid_search_keys.items())[0]
3824
3837
  # Show warning for country only if country is the only key
3825
3838
  if x[key].nunique() == 1:
3826
3839
  msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3827
- print(msg)
3828
- self.logger.warning(msg)
3829
- self.warning_counter.increment()
3840
+ if not silent_mode:
3841
+ self.__log_warning(msg)
3830
3842
  # TODO maybe raise ValidationError
3831
3843
 
3832
3844
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
@@ -3886,9 +3898,7 @@ class FeaturesEnricher(TransformerMixin):
3886
3898
  )
3887
3899
  else:
3888
3900
  msg = self.bundle.get("features_info_zero_important_features")
3889
- self.logger.warning(msg)
3890
- self.__display_support_link(msg)
3891
- self.warning_counter.increment()
3901
+ self.__log_warning(msg, show_support_link=True)
3892
3902
  except (ImportError, NameError):
3893
3903
  print(msg)
3894
3904
  print(self._internal_features_info)
@@ -3990,8 +4000,7 @@ class FeaturesEnricher(TransformerMixin):
3990
4000
  " But not used because not registered user"
3991
4001
  )
3992
4002
  if not silent_mode:
3993
- print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3994
- self.warning_counter.increment()
4003
+ self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3995
4004
 
3996
4005
  # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3997
4006
  if check_need_detect(SearchKey.PHONE):
@@ -4010,8 +4019,7 @@ class FeaturesEnricher(TransformerMixin):
4010
4019
  "But not used because not registered user"
4011
4020
  )
4012
4021
  if not silent_mode:
4013
- print(self.bundle.get("phone_detected_not_registered"))
4014
- self.warning_counter.increment()
4022
+ self.__log_warning(self.bundle.get("phone_detected_not_registered"))
4015
4023
 
4016
4024
  return search_keys
4017
4025
 
@@ -4035,19 +4043,13 @@ class FeaturesEnricher(TransformerMixin):
4035
4043
  part2 = train[half_train:]
4036
4044
  train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
4037
4045
  if train_psi > 0.2:
4038
- self.warning_counter.increment()
4039
- msg = self.bundle.get("train_unstable_target").format(train_psi)
4040
- print(msg)
4041
- self.logger.warning(msg)
4046
+ self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi))
4042
4047
 
4043
4048
  # 2. Check train-test PSI
4044
4049
  if eval1 is not None:
4045
4050
  train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
4046
4051
  if train_test_psi > 0.2:
4047
- self.warning_counter.increment()
4048
- msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
4049
- print(msg)
4050
- self.logger.warning(msg)
4052
+ self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi))
4051
4053
 
4052
4054
  def _dump_python_libs(self):
4053
4055
  try:
@@ -4069,8 +4071,8 @@ class FeaturesEnricher(TransformerMixin):
4069
4071
  self.logger.warning(f"Showing support link: {link_text}")
4070
4072
  display(
4071
4073
  HTML(
4072
- f"""<br/>{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
4073
- here</a>"""
4074
+ f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
4075
+ here</a><br/>"""
4074
4076
  )
4075
4077
  )
4076
4078
  except (ImportError, NameError):
@@ -745,20 +745,25 @@ class OtherEstimatorWrapper(EstimatorWrapper):
745
745
 
746
746
 
747
747
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
748
- if isinstance(scoring, str) and scoring is not None:
748
+ if scoring is None:
749
+ return
750
+
751
+ if isinstance(scoring, str):
749
752
  _get_scorer_by_name(scoring)
750
- elif isinstance(scoring, Callable):
751
- spec = inspect.getfullargspec(scoring)
752
- if len(spec.args) < 3:
753
- raise ValidationError(
754
- f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
755
- )
756
- elif scoring is not None:
753
+ return
754
+
755
+ if not isinstance(scoring, Callable):
757
756
  raise ValidationError(
758
757
  f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
759
758
  " that accepts 3 input arguments: estimator, x, y"
760
759
  )
761
760
 
761
+ spec = inspect.getfullargspec(scoring)
762
+ if len(spec.args) < 3:
763
+ raise ValidationError(
764
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
765
+ )
766
+
762
767
 
763
768
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
764
769
  metric_name = scoring
@@ -26,7 +26,6 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
26
26
  from upgini.utils import find_numbers_with_decimal_comma
27
27
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
28
28
  from upgini.utils.phone_utils import PhoneSearchKeyConverter
29
- from upgini.utils.warning_counter import WarningCounter
30
29
 
31
30
 
32
31
  class Normalizer:
@@ -37,16 +36,13 @@ class Normalizer:
37
36
  self,
38
37
  bundle: ResourceBundle = None,
39
38
  logger: Logger = None,
40
- warnings_counter: WarningCounter = None,
41
- silent_mode=False,
42
39
  ):
43
40
  self.bundle = bundle or get_custom_bundle()
44
41
  self.logger = logger or getLogger()
45
- self.warnings_counter = warnings_counter or WarningCounter()
46
- self.silent_mode = silent_mode
47
42
  self.columns_renaming = {}
48
43
  self.search_keys = {}
49
44
  self.generated_features = []
45
+ self.removed_features = []
50
46
 
51
47
  def normalize(
52
48
  self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
@@ -139,19 +135,11 @@ class Normalizer:
139
135
  def _remove_dates_from_features(self, df: pd.DataFrame):
140
136
  features = self._get_features(df)
141
137
 
142
- removed_features = []
143
138
  for f in features:
144
139
  if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
145
- removed_features.append(f)
140
+ self.removed_features.append(f)
146
141
  df.drop(columns=f, inplace=True)
147
142
 
148
- if removed_features:
149
- msg = self.bundle.get("dataset_date_features").format(removed_features)
150
- self.logger.warning(msg)
151
- if not self.silent_mode:
152
- print(msg)
153
- self.warnings_counter.increment()
154
-
155
143
  return df
156
144
 
157
145
  def _cut_too_long_string_values(self, df: pd.DataFrame):