upgini 1.1.309__py3-none-any.whl → 1.1.309a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -61,11 +61,15 @@ from upgini.metadata import (
61
61
  SearchKey,
62
62
  )
63
63
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
64
+ from upgini.normalizer.normalize_utils import Normalizer
64
65
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
65
66
  from upgini.search_task import SearchTask
66
67
  from upgini.spinner import Spinner
67
68
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
68
- from upgini.utils.country_utils import CountrySearchKeyDetector
69
+ from upgini.utils.country_utils import (
70
+ CountrySearchKeyConverter,
71
+ CountrySearchKeyDetector,
72
+ )
69
73
  from upgini.utils.custom_loss_utils import (
70
74
  get_additional_params_custom_loss,
71
75
  get_runtime_params_custom_loss,
@@ -90,8 +94,12 @@ from upgini.utils.display_utils import (
90
94
  from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
91
95
  from upgini.utils.features_validator import FeaturesValidator
92
96
  from upgini.utils.format import Format
93
- from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
- from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
97
+ from upgini.utils.ip_utils import IpSearchKeyConverter
98
+ from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
99
+ from upgini.utils.postal_code_utils import (
100
+ PostalCodeSearchKeyConverter,
101
+ PostalCodeSearchKeyDetector,
102
+ )
95
103
 
96
104
  try:
97
105
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -237,6 +245,7 @@ class FeaturesEnricher(TransformerMixin):
237
245
 
238
246
  self.passed_features: List[str] = []
239
247
  self.df_with_original_index: Optional[pd.DataFrame] = None
248
+ self.fit_columns_renaming: Optional[Dict[str, str]] = None
240
249
  self.country_added = False
241
250
  self.fit_generated_features: List[str] = []
242
251
  self.fit_dropped_features: Set[str] = set()
@@ -247,7 +256,7 @@ class FeaturesEnricher(TransformerMixin):
247
256
  self.eval_set: Optional[List[Tuple]] = None
248
257
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
249
258
  self.imbalanced = False
250
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
259
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
251
260
 
252
261
  validate_version(self.logger)
253
262
  self.search_keys = search_keys or {}
@@ -706,7 +715,7 @@ class FeaturesEnricher(TransformerMixin):
706
715
 
707
716
  start_time = time.time()
708
717
  try:
709
- result = self.__inner_transform(
718
+ result, _ = self.__inner_transform(
710
719
  trace_id,
711
720
  X,
712
721
  exclude_features_sources=exclude_features_sources,
@@ -906,8 +915,16 @@ class FeaturesEnricher(TransformerMixin):
906
915
  search_keys,
907
916
  groups,
908
917
  _cv,
918
+ columns_renaming,
909
919
  ) = prepared_data
910
920
 
921
+ # rename cat_features
922
+ if cat_features:
923
+ for new_c, old_c in columns_renaming.items():
924
+ if old_c in cat_features:
925
+ cat_features.remove(old_c)
926
+ cat_features.append(new_c)
927
+
911
928
  gc.collect()
912
929
 
913
930
  print(self.bundle.get("metrics_start"))
@@ -920,7 +937,7 @@ class FeaturesEnricher(TransformerMixin):
920
937
 
921
938
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
922
939
 
923
- has_date = self._get_date_column(search_keys) is not None
940
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
924
941
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
925
942
 
926
943
  wrapper = EstimatorWrapper.create(
@@ -1113,7 +1130,7 @@ class FeaturesEnricher(TransformerMixin):
1113
1130
  )
1114
1131
 
1115
1132
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1116
- date_column = self._get_date_column(search_keys)
1133
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1117
1134
  if (
1118
1135
  uplift_col in metrics_df.columns
1119
1136
  and (metrics_df[uplift_col] < 0).any()
@@ -1201,9 +1218,11 @@ class FeaturesEnricher(TransformerMixin):
1201
1218
 
1202
1219
  extended_X = x.copy()
1203
1220
  generated_features = []
1204
- date_column = self._get_date_column(search_keys)
1221
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1205
1222
  if date_column is not None:
1206
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1223
+ converter = DateTimeSearchKeyConverter(
1224
+ date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1225
+ )
1207
1226
  extended_X = converter.convert(extended_X, keep_time=True)
1208
1227
  generated_features.extend(converter.generated_features)
1209
1228
  email_column = self._get_email_column(search_keys)
@@ -1259,7 +1278,7 @@ class FeaturesEnricher(TransformerMixin):
1259
1278
  groups = None
1260
1279
 
1261
1280
  if not isinstance(_cv, BaseCrossValidator):
1262
- date_column = self._get_date_column(search_keys)
1281
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1263
1282
  date_series = X[date_column] if date_column is not None else None
1264
1283
  _cv, groups = CVConfig(
1265
1284
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1282,7 +1301,7 @@ class FeaturesEnricher(TransformerMixin):
1282
1301
 
1283
1302
  def _get_client_cat_features(
1284
1303
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1285
- ) -> Optional[List[str]]:
1304
+ ) -> Tuple[Optional[List[str]], List[str]]:
1286
1305
  cat_features = None
1287
1306
  search_keys_for_metrics = []
1288
1307
  if (
@@ -1342,11 +1361,15 @@ class FeaturesEnricher(TransformerMixin):
1342
1361
  progress_bar,
1343
1362
  progress_callback,
1344
1363
  )
1345
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1364
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1365
+ sampled_data
1366
+ )
1346
1367
 
1347
1368
  excluding_search_keys = list(search_keys.keys())
1348
1369
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1349
- excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1370
+ for sk in excluding_search_keys:
1371
+ if columns_renaming.get(sk) in search_keys_for_metrics:
1372
+ excluding_search_keys.remove(sk)
1350
1373
 
1351
1374
  client_features = [
1352
1375
  c
@@ -1392,6 +1415,7 @@ class FeaturesEnricher(TransformerMixin):
1392
1415
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1393
1416
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1394
1417
 
1418
+ # TODO: check whether these conversions are still needed
1395
1419
  # Remove datetime features
1396
1420
  datetime_features = [
1397
1421
  f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
@@ -1479,6 +1503,7 @@ class FeaturesEnricher(TransformerMixin):
1479
1503
  search_keys,
1480
1504
  groups,
1481
1505
  cv,
1506
+ columns_renaming,
1482
1507
  )
1483
1508
 
1484
1509
  @dataclass
@@ -1488,6 +1513,7 @@ class FeaturesEnricher(TransformerMixin):
1488
1513
  enriched_X: pd.DataFrame
1489
1514
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1490
1515
  search_keys: Dict[str, SearchKey]
1516
+ columns_renaming: Dict[str, str]
1491
1517
 
1492
1518
  def _sample_data_for_metrics(
1493
1519
  self,
@@ -1527,11 +1553,15 @@ class FeaturesEnricher(TransformerMixin):
1527
1553
  )
1528
1554
 
1529
1555
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1530
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1556
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1557
+ self.__cached_sampled_datasets
1558
+ )
1531
1559
  if exclude_features_sources:
1532
1560
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1533
1561
 
1534
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1562
+ return self.__mk_sampled_data_tuple(
1563
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1564
+ )
1535
1565
 
1536
1566
  def __sample_only_input(
1537
1567
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1549,6 +1579,22 @@ class FeaturesEnricher(TransformerMixin):
1549
1579
  eval_xy[EVAL_SET_INDEX] = idx + 1
1550
1580
  df = pd.concat([df, eval_xy])
1551
1581
 
1582
+ search_keys = self.search_keys.copy()
1583
+ search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1584
+
1585
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1586
+ generated_features = []
1587
+ if date_column is not None:
1588
+ converter = DateTimeSearchKeyConverter(
1589
+ date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1590
+ )
1591
+ df = converter.convert(df, keep_time=True)
1592
+ generated_features = converter.generated_features
1593
+
1594
+ normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1595
+ df = normalizer.normalize(df)
1596
+ columns_renaming = normalizer.columns_renaming
1597
+
1552
1598
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1553
1599
 
1554
1600
  num_samples = _num_samples(df)
@@ -1561,24 +1607,41 @@ class FeaturesEnricher(TransformerMixin):
1561
1607
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1562
1608
  df = df.sample(n=sample_rows, random_state=self.random_state)
1563
1609
 
1564
- df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
- df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1610
+ email_column = self._get_email_column(search_keys)
1611
+ hem_column = self._get_hem_column(search_keys)
1612
+ if email_column:
1613
+ converter = EmailSearchKeyConverter(
1614
+ email_column, hem_column, search_keys, columns_renaming, [], self.bundle, self.logger
1615
+ )
1616
+ df = converter.convert(df)
1617
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1618
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1619
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1566
1620
 
1567
- train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1621
+ train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1568
1622
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1569
1623
  y_sampled = train_df[TARGET].copy()
1570
1624
  enriched_X = X_sampled
1571
1625
 
1572
1626
  if eval_set is not None:
1573
1627
  for idx in range(len(eval_set)):
1574
- eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1628
+ eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1575
1629
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1576
1630
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1577
1631
  enriched_eval_X = eval_X_sampled
1578
1632
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1579
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1633
+ self.__cached_sampled_datasets = (
1634
+ X_sampled,
1635
+ y_sampled,
1636
+ enriched_X,
1637
+ eval_set_sampled_dict,
1638
+ search_keys,
1639
+ columns_renaming,
1640
+ )
1580
1641
 
1581
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1642
+ return self.__mk_sampled_data_tuple(
1643
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1644
+ )
1582
1645
 
1583
1646
  def __sample_balanced(
1584
1647
  self,
@@ -1590,7 +1653,7 @@ class FeaturesEnricher(TransformerMixin):
1590
1653
  search_keys = self.fit_search_keys
1591
1654
 
1592
1655
  rows_to_drop = None
1593
- has_date = self._get_date_column(search_keys) is not None
1656
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1594
1657
  task_type = self.model_task_type or define_task(
1595
1658
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1596
1659
  )
@@ -1644,9 +1707,18 @@ class FeaturesEnricher(TransformerMixin):
1644
1707
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1645
1708
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1646
1709
 
1647
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1710
+ self.__cached_sampled_datasets = (
1711
+ X_sampled,
1712
+ y_sampled,
1713
+ enriched_X,
1714
+ eval_set_sampled_dict,
1715
+ search_keys,
1716
+ self.fit_columns_renaming,
1717
+ )
1648
1718
 
1649
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1719
+ return self.__mk_sampled_data_tuple(
1720
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1721
+ )
1650
1722
 
1651
1723
  def __sample_imbalanced(
1652
1724
  self,
@@ -1686,7 +1758,7 @@ class FeaturesEnricher(TransformerMixin):
1686
1758
  tmp_target_name = "__target"
1687
1759
  df = df.rename(columns={TARGET: tmp_target_name})
1688
1760
 
1689
- enriched_df = self.__inner_transform(
1761
+ enriched_df, columns_renaming = self.__inner_transform(
1690
1762
  trace_id,
1691
1763
  df,
1692
1764
  exclude_features_sources=exclude_features_sources,
@@ -1734,7 +1806,7 @@ class FeaturesEnricher(TransformerMixin):
1734
1806
  tmp_target_name = "__target"
1735
1807
  df = df.rename(columns={TARGET: tmp_target_name})
1736
1808
 
1737
- enriched_Xy = self.__inner_transform(
1809
+ enriched_Xy, columns_renaming = self.__inner_transform(
1738
1810
  trace_id,
1739
1811
  df,
1740
1812
  exclude_features_sources=exclude_features_sources,
@@ -1759,9 +1831,18 @@ class FeaturesEnricher(TransformerMixin):
1759
1831
  y_sampled = enriched_Xy[TARGET].copy()
1760
1832
  enriched_X = enriched_Xy.drop(columns=TARGET)
1761
1833
 
1762
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1834
+ self.__cached_sampled_datasets = (
1835
+ X_sampled,
1836
+ y_sampled,
1837
+ enriched_X,
1838
+ eval_set_sampled_dict,
1839
+ self.search_keys,
1840
+ columns_renaming,
1841
+ )
1763
1842
 
1764
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1843
+ return self.__mk_sampled_data_tuple(
1844
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1845
+ )
1765
1846
 
1766
1847
  def __mk_sampled_data_tuple(
1767
1848
  self,
@@ -1770,6 +1851,7 @@ class FeaturesEnricher(TransformerMixin):
1770
1851
  enriched_X: pd.DataFrame,
1771
1852
  eval_set_sampled_dict: Dict,
1772
1853
  search_keys: Dict,
1854
+ columns_renaming: Dict[str, str],
1773
1855
  ):
1774
1856
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1775
1857
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1778,6 +1860,7 @@ class FeaturesEnricher(TransformerMixin):
1778
1860
  enriched_X=enriched_X,
1779
1861
  eval_set_sampled_dict=eval_set_sampled_dict,
1780
1862
  search_keys=search_keys,
1863
+ columns_renaming=columns_renaming,
1781
1864
  )
1782
1865
 
1783
1866
  def get_search_id(self) -> Optional[str]:
@@ -1866,7 +1949,7 @@ class FeaturesEnricher(TransformerMixin):
1866
1949
  progress_bar: Optional[ProgressBar] = None,
1867
1950
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1868
1951
  add_fit_system_record_id: bool = False,
1869
- ) -> pd.DataFrame:
1952
+ ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1870
1953
  if self._search_task is None:
1871
1954
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1872
1955
 
@@ -1879,13 +1962,13 @@ class FeaturesEnricher(TransformerMixin):
1879
1962
 
1880
1963
  if len(self.feature_names_) == 0:
1881
1964
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1882
- return X
1965
+ return X, {c: c for c in X.columns}
1883
1966
 
1884
1967
  if self._has_paid_features(exclude_features_sources):
1885
1968
  msg = self.bundle.get("transform_with_paid_features")
1886
1969
  self.logger.warning(msg)
1887
1970
  self.__display_support_link(msg)
1888
- return None
1971
+ return None, {c: c for c in X.columns}
1889
1972
 
1890
1973
  if not metrics_calculation:
1891
1974
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1896,7 +1979,7 @@ class FeaturesEnricher(TransformerMixin):
1896
1979
  self.logger.warning(msg)
1897
1980
  print(msg)
1898
1981
  show_request_quote_button()
1899
- return None
1982
+ return None, {c: c for c in X.columns}
1900
1983
  else:
1901
1984
  msg = self.bundle.get("transform_usage_info").format(
1902
1985
  transform_usage.limit, transform_usage.transformed_rows
@@ -1934,9 +2017,11 @@ class FeaturesEnricher(TransformerMixin):
1934
2017
  df = self.__add_country_code(df, search_keys)
1935
2018
 
1936
2019
  generated_features = []
1937
- date_column = self._get_date_column(search_keys)
2020
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1938
2021
  if date_column is not None:
1939
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2022
+ converter = DateTimeSearchKeyConverter(
2023
+ date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2024
+ )
1940
2025
  df = converter.convert(df)
1941
2026
  self.logger.info(f"Date column after convertion: {df[date_column]}")
1942
2027
  generated_features.extend(converter.generated_features)
@@ -1945,61 +2030,87 @@ class FeaturesEnricher(TransformerMixin):
1945
2030
  if self.add_date_if_missing:
1946
2031
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1947
2032
 
2033
+ normalizer = Normalizer(
2034
+ search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2035
+ )
2036
+ df = normalizer.normalize(df)
2037
+ columns_renaming = normalizer.columns_renaming
2038
+
1948
2039
  # Don't pass all features in backend on transform
1949
- original_features_for_transform = []
1950
2040
  runtime_parameters = self._get_copy_of_runtime_parameters()
1951
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
- if len(features_not_to_pass) > 0:
1953
- # Pass only features that need for transform
1954
- features_for_transform = self._search_task.get_features_for_transform()
1955
- if features_for_transform is not None and len(features_for_transform) > 0:
1956
- file_metadata = self._search_task.get_file_metadata(trace_id)
1957
- original_features_for_transform = [
1958
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
- ]
1960
-
1961
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2041
+ features_for_transform = self._search_task.get_features_for_transform() or []
2042
+ if len(features_for_transform) > 0:
2043
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1962
2044
 
1963
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2045
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1964
2046
 
1965
2047
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1966
2048
  df[columns_for_system_record_id], index=False
1967
2049
  ).astype("Float64")
1968
2050
 
1969
2051
  # Explode multiple search keys
1970
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2052
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1971
2053
 
1972
2054
  email_column = self._get_email_column(search_keys)
1973
2055
  hem_column = self._get_hem_column(search_keys)
1974
- email_converted_to_hem = False
2056
+ # email_converted_to_hem = False
1975
2057
  if email_column:
1976
2058
  converter = EmailSearchKeyConverter(
1977
- email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2059
+ email_column,
2060
+ hem_column,
2061
+ search_keys,
2062
+ columns_renaming,
2063
+ list(unnest_search_keys.keys()),
2064
+ self.logger,
1978
2065
  )
1979
2066
  df = converter.convert(df)
1980
2067
  generated_features.extend(converter.generated_features)
1981
- email_converted_to_hem = converter.email_converted_to_hem
2068
+
2069
+ ip_column = self._get_ip_column(search_keys)
2070
+ if ip_column:
2071
+ converter = IpSearchKeyConverter(
2072
+ ip_column,
2073
+ search_keys,
2074
+ columns_renaming,
2075
+ list(unnest_search_keys.keys()),
2076
+ self.bundle,
2077
+ self.logger,
2078
+ )
2079
+ df = converter.convert(df)
2080
+
2081
+ phone_column = self._get_phone_column(search_keys)
2082
+ country_column = self._get_country_column(search_keys)
2083
+ if phone_column:
2084
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2085
+ df = converter.convert(df)
2086
+
2087
+ if country_column:
2088
+ converter = CountrySearchKeyConverter(country_column)
2089
+ df = converter.convert(df)
2090
+
2091
+ postal_code = self._get_postal_column(search_keys)
2092
+ if postal_code:
2093
+ converter = PostalCodeSearchKeyConverter(postal_code)
2094
+ df = converter.convert(df)
2095
+
1982
2096
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1983
2097
 
1984
2098
  meaning_types = {col: key.value for col, key in search_keys.items()}
1985
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
- for col in original_features_for_transform:
2099
+ for col in features_for_transform:
1987
2100
  meaning_types[col] = FileColumnMeaningType.FEATURE
1988
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
-
1990
- if email_converted_to_hem:
1991
- features_not_to_pass.append(email_column)
1992
-
1993
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2101
+ features_not_to_pass = [
2102
+ c for c in df.columns if c not in search_keys.keys() and c not in features_for_transform
2103
+ ]
1995
2104
 
1996
2105
  if add_fit_system_record_id:
1997
- df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2106
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2107
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2108
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1998
2109
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1999
2110
  features_not_to_pass.append(SORT_ID)
2000
2111
 
2001
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
-
2112
+ # search keys may have changed after the explode step
2113
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2003
2114
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2004
2115
  "Float64"
2005
2116
  )
@@ -2035,8 +2146,7 @@ class FeaturesEnricher(TransformerMixin):
2035
2146
  rest_client=self.rest_client,
2036
2147
  logger=self.logger,
2037
2148
  )
2038
- if email_converted_to_hem:
2039
- dataset.ignore_columns = [email_column]
2149
+ dataset.columns_renaming = columns_renaming
2040
2150
 
2041
2151
  if max_features is not None or importance_threshold is not None:
2042
2152
  exclude_features_sources = list(
@@ -2138,7 +2248,7 @@ class FeaturesEnricher(TransformerMixin):
2138
2248
  if add_fit_system_record_id:
2139
2249
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2140
2250
 
2141
- return result
2251
+ return result, columns_renaming
2142
2252
 
2143
2253
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2144
2254
  features_info = self._internal_features_info
@@ -2239,6 +2349,9 @@ class FeaturesEnricher(TransformerMixin):
2239
2349
  self.df_with_original_index = None
2240
2350
  self.__cached_sampled_datasets = None
2241
2351
  self.metrics = None
2352
+ self.fit_columns_renaming = None
2353
+ self.fit_dropped_features = set()
2354
+ self.fit_generated_features = []
2242
2355
 
2243
2356
  validated_X = self._validate_X(X)
2244
2357
  validated_y = self._validate_y(validated_X, y)
@@ -2285,9 +2398,10 @@ class FeaturesEnricher(TransformerMixin):
2285
2398
  self.fit_search_keys = self.search_keys.copy()
2286
2399
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2287
2400
 
2288
- maybe_date_column = self._get_date_column(self.fit_search_keys)
2401
+ maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2289
2402
  has_date = maybe_date_column is not None
2290
2403
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2404
+
2291
2405
  self._validate_binary_observations(validated_y, model_task_type)
2292
2406
 
2293
2407
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2317,7 +2431,13 @@ class FeaturesEnricher(TransformerMixin):
2317
2431
  self.fit_generated_features = []
2318
2432
 
2319
2433
  if has_date:
2320
- converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2434
+ converter = DateTimeSearchKeyConverter(
2435
+ maybe_date_column,
2436
+ self.date_format,
2437
+ self.logger,
2438
+ bundle=self.bundle,
2439
+ warnings_counter=self.warning_counter,
2440
+ )
2321
2441
  df = converter.convert(df, keep_time=True)
2322
2442
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2323
2443
  self.fit_generated_features.extend(converter.generated_features)
@@ -2334,7 +2454,12 @@ class FeaturesEnricher(TransformerMixin):
2334
2454
 
2335
2455
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2336
2456
 
2337
- # TODO normalize and convert all columns
2457
+ normalizer = Normalizer(
2458
+ self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2459
+ )
2460
+ df = normalizer.normalize(df)
2461
+ columns_renaming = normalizer.columns_renaming
2462
+ self.fit_columns_renaming = columns_renaming
2338
2463
 
2339
2464
  df = remove_fintech_duplicates(
2340
2465
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2342,38 +2467,59 @@ class FeaturesEnricher(TransformerMixin):
2342
2467
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2343
2468
 
2344
2469
  # Explode multiple search keys
2345
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
- meaning_types = {
2347
- **{col: key.value for col, key in self.fit_search_keys.items()},
2348
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
- }
2350
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
- if eval_set is not None and len(eval_set) > 0:
2352
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2470
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2354
2471
 
2355
2472
  # TODO check that this is correct for enrichment
2356
2473
  self.df_with_original_index = df.copy()
2474
+ # TODO: check whether the _time column needs to be dropped from df_with_original_index
2357
2475
 
2358
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2476
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2359
2477
 
2360
2478
  # Convert EMAIL to HEM after unnesting to do it only with one column
2361
2479
  email_column = self._get_email_column(self.fit_search_keys)
2362
2480
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
- email_converted_to_hem = False
2364
2481
  if email_column:
2365
2482
  converter = EmailSearchKeyConverter(
2366
- email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2483
+ email_column,
2484
+ hem_column,
2485
+ self.fit_search_keys,
2486
+ columns_renaming,
2487
+ list(unnest_search_keys.keys()),
2488
+ self.logger,
2367
2489
  )
2368
2490
  df = converter.convert(df)
2369
2491
  self.fit_generated_features.extend(converter.generated_features)
2370
- email_converted_to_hem = converter.email_converted_to_hem
2492
+
2493
+ ip_column = self._get_ip_column(self.fit_search_keys)
2494
+ if ip_column:
2495
+ converter = IpSearchKeyConverter(
2496
+ ip_column,
2497
+ self.fit_search_keys,
2498
+ columns_renaming,
2499
+ list(unnest_search_keys.keys()),
2500
+ self.bundle,
2501
+ self.logger,
2502
+ )
2503
+ df = converter.convert(df)
2504
+
2505
+ phone_column = self._get_phone_column(self.fit_search_keys)
2506
+ country_column = self._get_country_column(self.fit_search_keys)
2507
+ if phone_column:
2508
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2509
+ df = converter.convert(df)
2510
+
2511
+ if country_column:
2512
+ converter = CountrySearchKeyConverter(country_column)
2513
+ df = converter.convert(df)
2514
+
2515
+ postal_code = self._get_postal_column(self.fit_search_keys)
2516
+ if postal_code:
2517
+ converter = PostalCodeSearchKeyConverter(postal_code)
2518
+ df = converter.convert(df)
2371
2519
 
2372
2520
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2373
2521
  self.fit_search_keys.keys()
2374
2522
  )
2375
- if email_converted_to_hem:
2376
- non_feature_columns.append(email_column)
2377
2523
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2378
2524
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2379
2525
 
@@ -2385,9 +2531,6 @@ class FeaturesEnricher(TransformerMixin):
2385
2531
  self.fit_dropped_features.update(features_to_drop)
2386
2532
  df = df.drop(columns=features_to_drop)
2387
2533
 
2388
- if email_converted_to_hem:
2389
- self.fit_dropped_features.add(email_column)
2390
-
2391
2534
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2392
2535
 
2393
2536
  meaning_types = {
@@ -2401,7 +2544,12 @@ class FeaturesEnricher(TransformerMixin):
2401
2544
  if eval_set is not None and len(eval_set) > 0:
2402
2545
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2403
2546
 
2404
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2547
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2548
+
2549
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2550
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2551
+
2552
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2405
2553
 
2406
2554
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2407
2555
 
@@ -2419,8 +2567,7 @@ class FeaturesEnricher(TransformerMixin):
2419
2567
  rest_client=self.rest_client,
2420
2568
  logger=self.logger,
2421
2569
  )
2422
- if email_converted_to_hem:
2423
- dataset.ignore_columns = [email_column]
2570
+ dataset.columns_renaming = columns_renaming
2424
2571
 
2425
2572
  self.passed_features = [
2426
2573
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2809,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
2809
2956
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2810
2957
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2811
2958
  else:
2812
- date_column = FeaturesEnricher._get_date_column(search_keys)
2959
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2813
2960
  sort_columns = [date_column] if date_column is not None else []
2814
2961
 
2815
2962
  # Xy = pd.concat([X, y], axis=1)
@@ -2905,10 +3052,10 @@ class FeaturesEnricher(TransformerMixin):
2905
3052
 
2906
3053
  do_without_pandas_limits(print_datasets_sample)
2907
3054
 
2908
- maybe_date_col = self._get_date_column(self.search_keys)
3055
+ maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2909
3056
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2910
3057
  # TODO cast date column to single dtype
2911
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3058
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2912
3059
  converted_X = date_converter.convert(X)
2913
3060
  min_date = converted_X[maybe_date_col].min()
2914
3061
  max_date = converted_X[maybe_date_col].max()
@@ -2935,12 +3082,6 @@ class FeaturesEnricher(TransformerMixin):
2935
3082
 
2936
3083
  return df
2937
3084
 
2938
- @staticmethod
2939
- def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
- for col, t in search_keys.items():
2941
- if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
- return col
2943
-
2944
3085
  @staticmethod
2945
3086
  def _add_current_date_as_key(
2946
3087
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -2956,7 +3097,7 @@ class FeaturesEnricher(TransformerMixin):
2956
3097
  logger.warning(msg)
2957
3098
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2958
3099
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2959
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3100
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2960
3101
  df = converter.convert(df)
2961
3102
  return df
2962
3103
 
@@ -2984,17 +3125,37 @@ class FeaturesEnricher(TransformerMixin):
2984
3125
  if len(cols) == 1:
2985
3126
  return cols[0]
2986
3127
 
3128
+ @staticmethod
3129
+ def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3130
+ cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3131
+ if len(cols) > 1:
3132
+ raise Exception("More than one ip column found after unnest")
3133
+ if len(cols) == 1:
3134
+ return cols[0]
3135
+
2987
3136
  @staticmethod
2988
3137
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2989
3138
  for col, t in search_keys.items():
2990
3139
  if t == SearchKey.PHONE:
2991
3140
  return col
2992
3141
 
3142
+ @staticmethod
3143
+ def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3144
+ for col, t in search_keys.items():
3145
+ if t == SearchKey.COUNTRY:
3146
+ return col
3147
+
3148
+ @staticmethod
3149
+ def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3150
+ for col, t in search_keys.items():
3151
+ if t == SearchKey.POSTAL_CODE:
3152
+ return col
3153
+
2993
3154
  def _explode_multiple_search_keys(
2994
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3155
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2995
3156
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
2996
3157
  # find groups of multiple search keys
2997
- search_key_names_by_type: Dict[SearchKey, str] = {}
3158
+ search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2998
3159
  for key_name, key_type in search_keys.items():
2999
3160
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3000
3161
  search_key_names_by_type = {
@@ -3018,6 +3179,7 @@ class FeaturesEnricher(TransformerMixin):
3018
3179
  del search_keys[old_key]
3019
3180
  search_keys[new_search_key] = key_type
3020
3181
  unnest_search_keys[new_search_key] = key_names
3182
+ columns_renaming[new_search_key] = new_search_key
3021
3183
 
3022
3184
  df = pd.concat(exploded_dfs, ignore_index=True)
3023
3185
  return df, unnest_search_keys
@@ -3025,7 +3187,7 @@ class FeaturesEnricher(TransformerMixin):
3025
3187
  def __add_fit_system_record_id(
3026
3188
  self,
3027
3189
  df: pd.DataFrame,
3028
- meaning_types: Dict[str, FileColumnMeaningType],
3190
+ # meaning_types: Dict[str, FileColumnMeaningType],
3029
3191
  search_keys: Dict[str, SearchKey],
3030
3192
  id_name: str,
3031
3193
  ) -> pd.DataFrame:
@@ -3048,9 +3210,9 @@ class FeaturesEnricher(TransformerMixin):
3048
3210
  ]
3049
3211
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3050
3212
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3051
- sort_exclude_columns.append(self._get_date_column(search_keys))
3213
+ sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3052
3214
  else:
3053
- date_column = self._get_date_column(search_keys)
3215
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3054
3216
  sort_columns = [date_column] if date_column is not None else []
3055
3217
 
3056
3218
  other_columns = sorted(
@@ -3059,13 +3221,6 @@ class FeaturesEnricher(TransformerMixin):
3059
3221
  for c in df.columns
3060
3222
  if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3061
3223
  ]
3062
- # [
3063
- # sk
3064
- # for sk, key_type in search_keys.items()
3065
- # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
- # and sk in df.columns
3067
- # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
- # ]
3069
3224
  )
3070
3225
 
3071
3226
  search_keys_hash = "search_keys_hash"
@@ -3078,9 +3233,6 @@ class FeaturesEnricher(TransformerMixin):
3078
3233
  if search_keys_hash in df.columns:
3079
3234
  df.drop(columns=search_keys_hash, inplace=True)
3080
3235
 
3081
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
- df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
-
3084
3236
  df = df.reset_index(drop=True).reset_index()
3085
3237
  # system_record_id saves correct order for fit
3086
3238
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3090,11 +3242,11 @@ class FeaturesEnricher(TransformerMixin):
3090
3242
  df.index.name = original_index_name
3091
3243
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3092
3244
 
3093
- meaning_types[id_name] = (
3094
- FileColumnMeaningType.SYSTEM_RECORD_ID
3095
- if id_name == SYSTEM_RECORD_ID
3096
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
- )
3245
+ # meaning_types[id_name] = (
3246
+ # FileColumnMeaningType.SYSTEM_RECORD_ID
3247
+ # if id_name == SYSTEM_RECORD_ID
3248
+ # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3249
+ # )
3098
3250
  return df
3099
3251
 
3100
3252
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3472,10 +3624,13 @@ class FeaturesEnricher(TransformerMixin):
3472
3624
  for _, key_type in search_keys.items():
3473
3625
  if not isinstance(key_type, SearchKey):
3474
3626
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3627
+
3475
3628
  valid_search_keys = {}
3476
3629
  unsupported_search_keys = {
3477
3630
  SearchKey.IP_RANGE_FROM,
3478
3631
  SearchKey.IP_RANGE_TO,
3632
+ SearchKey.IPV6_RANGE_FROM,
3633
+ SearchKey.IPV6_RANGE_TO,
3479
3634
  SearchKey.MSISDN_RANGE_FROM,
3480
3635
  SearchKey.MSISDN_RANGE_TO,
3481
3636
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3565,6 +3720,7 @@ class FeaturesEnricher(TransformerMixin):
3565
3720
  print(msg)
3566
3721
  self.logger.warning(msg)
3567
3722
  self.warning_counter.increment()
3723
+ # TODO maybe raise ValidationError
3568
3724
 
3569
3725
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3570
3726