upgini 1.1.309a1__py3-none-any.whl → 1.1.309a3511.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -61,15 +61,11 @@ from upgini.metadata import (
61
61
  SearchKey,
62
62
  )
63
63
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
64
- from upgini.normalizer.normalize_utils import Normalizer
65
64
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
66
65
  from upgini.search_task import SearchTask
67
66
  from upgini.spinner import Spinner
68
67
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
69
- from upgini.utils.country_utils import (
70
- CountrySearchKeyConverter,
71
- CountrySearchKeyDetector,
72
- )
68
+ from upgini.utils.country_utils import CountrySearchKeyDetector
73
69
  from upgini.utils.custom_loss_utils import (
74
70
  get_additional_params_custom_loss,
75
71
  get_runtime_params_custom_loss,
@@ -94,12 +90,8 @@ from upgini.utils.display_utils import (
94
90
  from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
95
91
  from upgini.utils.features_validator import FeaturesValidator
96
92
  from upgini.utils.format import Format
97
- from upgini.utils.ip_utils import IpSearchKeyConverter
98
- from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
99
- from upgini.utils.postal_code_utils import (
100
- PostalCodeSearchKeyConverter,
101
- PostalCodeSearchKeyDetector,
102
- )
93
+ from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
+ from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
103
95
 
104
96
  try:
105
97
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -245,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
245
237
 
246
238
  self.passed_features: List[str] = []
247
239
  self.df_with_original_index: Optional[pd.DataFrame] = None
248
- self.fit_columns_renaming: Optional[Dict[str, str]] = None
249
240
  self.country_added = False
250
241
  self.fit_generated_features: List[str] = []
251
242
  self.fit_dropped_features: Set[str] = set()
@@ -256,7 +247,7 @@ class FeaturesEnricher(TransformerMixin):
256
247
  self.eval_set: Optional[List[Tuple]] = None
257
248
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
258
249
  self.imbalanced = False
259
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
250
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
260
251
 
261
252
  validate_version(self.logger)
262
253
  self.search_keys = search_keys or {}
@@ -715,7 +706,7 @@ class FeaturesEnricher(TransformerMixin):
715
706
 
716
707
  start_time = time.time()
717
708
  try:
718
- result, _ = self.__inner_transform(
709
+ result = self.__inner_transform(
719
710
  trace_id,
720
711
  X,
721
712
  exclude_features_sources=exclude_features_sources,
@@ -915,16 +906,8 @@ class FeaturesEnricher(TransformerMixin):
915
906
  search_keys,
916
907
  groups,
917
908
  _cv,
918
- columns_renaming,
919
909
  ) = prepared_data
920
910
 
921
- # rename cat_features
922
- if cat_features:
923
- for new_c, old_c in columns_renaming.items():
924
- if old_c in cat_features:
925
- cat_features.remove(old_c)
926
- cat_features.append(new_c)
927
-
928
911
  gc.collect()
929
912
 
930
913
  print(self.bundle.get("metrics_start"))
@@ -937,7 +920,7 @@ class FeaturesEnricher(TransformerMixin):
937
920
 
938
921
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
939
922
 
940
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
923
+ has_date = self._get_date_column(search_keys) is not None
941
924
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
942
925
 
943
926
  wrapper = EstimatorWrapper.create(
@@ -1130,7 +1113,7 @@ class FeaturesEnricher(TransformerMixin):
1130
1113
  )
1131
1114
 
1132
1115
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1133
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1116
+ date_column = self._get_date_column(search_keys)
1134
1117
  if (
1135
1118
  uplift_col in metrics_df.columns
1136
1119
  and (metrics_df[uplift_col] < 0).any()
@@ -1218,11 +1201,9 @@ class FeaturesEnricher(TransformerMixin):
1218
1201
 
1219
1202
  extended_X = x.copy()
1220
1203
  generated_features = []
1221
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1204
+ date_column = self._get_date_column(search_keys)
1222
1205
  if date_column is not None:
1223
- converter = DateTimeSearchKeyConverter(
1224
- date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1225
- )
1206
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1226
1207
  extended_X = converter.convert(extended_X, keep_time=True)
1227
1208
  generated_features.extend(converter.generated_features)
1228
1209
  email_column = self._get_email_column(search_keys)
@@ -1278,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
1278
1259
  groups = None
1279
1260
 
1280
1261
  if not isinstance(_cv, BaseCrossValidator):
1281
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1262
+ date_column = self._get_date_column(search_keys)
1282
1263
  date_series = X[date_column] if date_column is not None else None
1283
1264
  _cv, groups = CVConfig(
1284
1265
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1301,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
1301
1282
 
1302
1283
  def _get_client_cat_features(
1303
1284
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1304
- ) -> Tuple[Optional[List[str]], List[str]]:
1285
+ ) -> Optional[List[str]]:
1305
1286
  cat_features = None
1306
1287
  search_keys_for_metrics = []
1307
1288
  if (
@@ -1361,15 +1342,11 @@ class FeaturesEnricher(TransformerMixin):
1361
1342
  progress_bar,
1362
1343
  progress_callback,
1363
1344
  )
1364
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1365
- sampled_data
1366
- )
1345
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1367
1346
 
1368
1347
  excluding_search_keys = list(search_keys.keys())
1369
1348
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1370
- for sk in excluding_search_keys:
1371
- if columns_renaming.get(sk) in search_keys_for_metrics:
1372
- excluding_search_keys.remove(sk)
1349
+ excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1373
1350
 
1374
1351
  client_features = [
1375
1352
  c
@@ -1415,7 +1392,6 @@ class FeaturesEnricher(TransformerMixin):
1415
1392
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1416
1393
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1417
1394
 
1418
- # TODO maybe there is no more need for these convertions
1419
1395
  # Remove datetime features
1420
1396
  datetime_features = [
1421
1397
  f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
@@ -1503,7 +1479,6 @@ class FeaturesEnricher(TransformerMixin):
1503
1479
  search_keys,
1504
1480
  groups,
1505
1481
  cv,
1506
- columns_renaming,
1507
1482
  )
1508
1483
 
1509
1484
  @dataclass
@@ -1513,7 +1488,6 @@ class FeaturesEnricher(TransformerMixin):
1513
1488
  enriched_X: pd.DataFrame
1514
1489
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1515
1490
  search_keys: Dict[str, SearchKey]
1516
- columns_renaming: Dict[str, str]
1517
1491
 
1518
1492
  def _sample_data_for_metrics(
1519
1493
  self,
@@ -1553,15 +1527,11 @@ class FeaturesEnricher(TransformerMixin):
1553
1527
  )
1554
1528
 
1555
1529
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1556
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1557
- self.__cached_sampled_datasets
1558
- )
1530
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1559
1531
  if exclude_features_sources:
1560
1532
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1561
1533
 
1562
- return self.__mk_sampled_data_tuple(
1563
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1564
- )
1534
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1565
1535
 
1566
1536
  def __sample_only_input(
1567
1537
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1579,22 +1549,6 @@ class FeaturesEnricher(TransformerMixin):
1579
1549
  eval_xy[EVAL_SET_INDEX] = idx + 1
1580
1550
  df = pd.concat([df, eval_xy])
1581
1551
 
1582
- search_keys = self.search_keys.copy()
1583
- search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1584
-
1585
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1586
- generated_features = []
1587
- if date_column is not None:
1588
- converter = DateTimeSearchKeyConverter(
1589
- date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1590
- )
1591
- df = converter.convert(df, keep_time=True)
1592
- generated_features = converter.generated_features
1593
-
1594
- normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1595
- df = normalizer.normalize(df)
1596
- columns_renaming = normalizer.columns_renaming
1597
-
1598
1552
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1599
1553
 
1600
1554
  num_samples = _num_samples(df)
@@ -1607,41 +1561,24 @@ class FeaturesEnricher(TransformerMixin):
1607
1561
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1608
1562
  df = df.sample(n=sample_rows, random_state=self.random_state)
1609
1563
 
1610
- email_column = self._get_email_column(search_keys)
1611
- hem_column = self._get_hem_column(search_keys)
1612
- if email_column:
1613
- converter = EmailSearchKeyConverter(
1614
- email_column, hem_column, search_keys, columns_renaming, [], self.bundle, self.logger
1615
- )
1616
- df = converter.convert(df)
1617
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1618
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1619
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1564
+ df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
+ df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1620
1566
 
1621
- train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1567
+ train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1622
1568
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1623
1569
  y_sampled = train_df[TARGET].copy()
1624
1570
  enriched_X = X_sampled
1625
1571
 
1626
1572
  if eval_set is not None:
1627
1573
  for idx in range(len(eval_set)):
1628
- eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1574
+ eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1629
1575
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1630
1576
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1631
1577
  enriched_eval_X = eval_X_sampled
1632
1578
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1633
- self.__cached_sampled_datasets = (
1634
- X_sampled,
1635
- y_sampled,
1636
- enriched_X,
1637
- eval_set_sampled_dict,
1638
- search_keys,
1639
- columns_renaming,
1640
- )
1579
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1641
1580
 
1642
- return self.__mk_sampled_data_tuple(
1643
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1644
- )
1581
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1645
1582
 
1646
1583
  def __sample_balanced(
1647
1584
  self,
@@ -1653,7 +1590,7 @@ class FeaturesEnricher(TransformerMixin):
1653
1590
  search_keys = self.fit_search_keys
1654
1591
 
1655
1592
  rows_to_drop = None
1656
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1593
+ has_date = self._get_date_column(search_keys) is not None
1657
1594
  task_type = self.model_task_type or define_task(
1658
1595
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1659
1596
  )
@@ -1707,18 +1644,9 @@ class FeaturesEnricher(TransformerMixin):
1707
1644
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1708
1645
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1709
1646
 
1710
- self.__cached_sampled_datasets = (
1711
- X_sampled,
1712
- y_sampled,
1713
- enriched_X,
1714
- eval_set_sampled_dict,
1715
- search_keys,
1716
- self.fit_columns_renaming,
1717
- )
1647
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1718
1648
 
1719
- return self.__mk_sampled_data_tuple(
1720
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1721
- )
1649
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1722
1650
 
1723
1651
  def __sample_imbalanced(
1724
1652
  self,
@@ -1758,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
1758
1686
  tmp_target_name = "__target"
1759
1687
  df = df.rename(columns={TARGET: tmp_target_name})
1760
1688
 
1761
- enriched_df, columns_renaming = self.__inner_transform(
1689
+ enriched_df = self.__inner_transform(
1762
1690
  trace_id,
1763
1691
  df,
1764
1692
  exclude_features_sources=exclude_features_sources,
@@ -1806,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
1806
1734
  tmp_target_name = "__target"
1807
1735
  df = df.rename(columns={TARGET: tmp_target_name})
1808
1736
 
1809
- enriched_Xy, columns_renaming = self.__inner_transform(
1737
+ enriched_Xy = self.__inner_transform(
1810
1738
  trace_id,
1811
1739
  df,
1812
1740
  exclude_features_sources=exclude_features_sources,
@@ -1831,18 +1759,9 @@ class FeaturesEnricher(TransformerMixin):
1831
1759
  y_sampled = enriched_Xy[TARGET].copy()
1832
1760
  enriched_X = enriched_Xy.drop(columns=TARGET)
1833
1761
 
1834
- self.__cached_sampled_datasets = (
1835
- X_sampled,
1836
- y_sampled,
1837
- enriched_X,
1838
- eval_set_sampled_dict,
1839
- self.search_keys,
1840
- columns_renaming,
1841
- )
1762
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1842
1763
 
1843
- return self.__mk_sampled_data_tuple(
1844
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1845
- )
1764
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1846
1765
 
1847
1766
  def __mk_sampled_data_tuple(
1848
1767
  self,
@@ -1851,7 +1770,6 @@ class FeaturesEnricher(TransformerMixin):
1851
1770
  enriched_X: pd.DataFrame,
1852
1771
  eval_set_sampled_dict: Dict,
1853
1772
  search_keys: Dict,
1854
- columns_renaming: Dict[str, str],
1855
1773
  ):
1856
1774
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1857
1775
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1860,7 +1778,6 @@ class FeaturesEnricher(TransformerMixin):
1860
1778
  enriched_X=enriched_X,
1861
1779
  eval_set_sampled_dict=eval_set_sampled_dict,
1862
1780
  search_keys=search_keys,
1863
- columns_renaming=columns_renaming,
1864
1781
  )
1865
1782
 
1866
1783
  def get_search_id(self) -> Optional[str]:
@@ -1949,7 +1866,7 @@ class FeaturesEnricher(TransformerMixin):
1949
1866
  progress_bar: Optional[ProgressBar] = None,
1950
1867
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1951
1868
  add_fit_system_record_id: bool = False,
1952
- ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1869
+ ) -> pd.DataFrame:
1953
1870
  if self._search_task is None:
1954
1871
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1955
1872
 
@@ -1962,13 +1879,13 @@ class FeaturesEnricher(TransformerMixin):
1962
1879
 
1963
1880
  if len(self.feature_names_) == 0:
1964
1881
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1965
- return X, {c: c for c in X.columns}
1882
+ return X
1966
1883
 
1967
1884
  if self._has_paid_features(exclude_features_sources):
1968
1885
  msg = self.bundle.get("transform_with_paid_features")
1969
1886
  self.logger.warning(msg)
1970
1887
  self.__display_support_link(msg)
1971
- return None, {c: c for c in X.columns}
1888
+ return None
1972
1889
 
1973
1890
  if not metrics_calculation:
1974
1891
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1979,7 +1896,7 @@ class FeaturesEnricher(TransformerMixin):
1979
1896
  self.logger.warning(msg)
1980
1897
  print(msg)
1981
1898
  show_request_quote_button()
1982
- return None, {c: c for c in X.columns}
1899
+ return None
1983
1900
  else:
1984
1901
  msg = self.bundle.get("transform_usage_info").format(
1985
1902
  transform_usage.limit, transform_usage.transformed_rows
@@ -2017,11 +1934,9 @@ class FeaturesEnricher(TransformerMixin):
2017
1934
  df = self.__add_country_code(df, search_keys)
2018
1935
 
2019
1936
  generated_features = []
2020
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1937
+ date_column = self._get_date_column(search_keys)
2021
1938
  if date_column is not None:
2022
- converter = DateTimeSearchKeyConverter(
2023
- date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2024
- )
1939
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2025
1940
  df = converter.convert(df)
2026
1941
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2027
1942
  generated_features.extend(converter.generated_features)
@@ -2030,87 +1945,61 @@ class FeaturesEnricher(TransformerMixin):
2030
1945
  if self.add_date_if_missing:
2031
1946
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2032
1947
 
2033
- normalizer = Normalizer(
2034
- search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2035
- )
2036
- df = normalizer.normalize(df)
2037
- columns_renaming = normalizer.columns_renaming
2038
-
2039
1948
  # Don't pass all features in backend on transform
1949
+ original_features_for_transform = []
2040
1950
  runtime_parameters = self._get_copy_of_runtime_parameters()
2041
- features_for_transform = self._search_task.get_features_for_transform() or []
2042
- if len(features_for_transform) > 0:
2043
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1951
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
+ if len(features_not_to_pass) > 0:
1953
+ # Pass only features that need for transform
1954
+ features_for_transform = self._search_task.get_features_for_transform()
1955
+ if features_for_transform is not None and len(features_for_transform) > 0:
1956
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1957
+ original_features_for_transform = [
1958
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
+ ]
1960
+
1961
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2044
1962
 
2045
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1963
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2046
1964
 
2047
1965
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
2048
1966
  df[columns_for_system_record_id], index=False
2049
1967
  ).astype("Float64")
2050
1968
 
2051
1969
  # Explode multiple search keys
2052
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1970
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2053
1971
 
2054
1972
  email_column = self._get_email_column(search_keys)
2055
1973
  hem_column = self._get_hem_column(search_keys)
2056
- # email_converted_to_hem = False
1974
+ email_converted_to_hem = False
2057
1975
  if email_column:
2058
1976
  converter = EmailSearchKeyConverter(
2059
- email_column,
2060
- hem_column,
2061
- search_keys,
2062
- columns_renaming,
2063
- list(unnest_search_keys.keys()),
2064
- self.logger,
1977
+ email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2065
1978
  )
2066
1979
  df = converter.convert(df)
2067
1980
  generated_features.extend(converter.generated_features)
2068
-
2069
- ip_column = self._get_ip_column(search_keys)
2070
- if ip_column:
2071
- converter = IpSearchKeyConverter(
2072
- ip_column,
2073
- search_keys,
2074
- columns_renaming,
2075
- list(unnest_search_keys.keys()),
2076
- self.bundle,
2077
- self.logger,
2078
- )
2079
- df = converter.convert(df)
2080
-
2081
- phone_column = self._get_phone_column(search_keys)
2082
- country_column = self._get_country_column(search_keys)
2083
- if phone_column:
2084
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2085
- df = converter.convert(df)
2086
-
2087
- if country_column:
2088
- converter = CountrySearchKeyConverter(country_column)
2089
- df = converter.convert(df)
2090
-
2091
- postal_code = self._get_postal_column(search_keys)
2092
- if postal_code:
2093
- converter = PostalCodeSearchKeyConverter(postal_code)
2094
- df = converter.convert(df)
2095
-
1981
+ email_converted_to_hem = converter.email_converted_to_hem
2096
1982
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
2097
1983
 
2098
1984
  meaning_types = {col: key.value for col, key in search_keys.items()}
2099
- for col in features_for_transform:
1985
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
+ for col in original_features_for_transform:
2100
1987
  meaning_types[col] = FileColumnMeaningType.FEATURE
2101
- features_not_to_pass = [
2102
- c for c in df.columns if c not in search_keys.keys() and c not in features_for_transform
2103
- ]
1988
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
+
1990
+ if email_converted_to_hem:
1991
+ features_not_to_pass.append(email_column)
1992
+
1993
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2104
1995
 
2105
1996
  if add_fit_system_record_id:
2106
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2107
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2108
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1997
+ df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2109
1998
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2110
1999
  features_not_to_pass.append(SORT_ID)
2111
2000
 
2112
- # search keys might be changed after explode
2113
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2001
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
+
2114
2003
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2115
2004
  "Float64"
2116
2005
  )
@@ -2146,7 +2035,8 @@ class FeaturesEnricher(TransformerMixin):
2146
2035
  rest_client=self.rest_client,
2147
2036
  logger=self.logger,
2148
2037
  )
2149
- dataset.columns_renaming = columns_renaming
2038
+ if email_converted_to_hem:
2039
+ dataset.ignore_columns = [email_column]
2150
2040
 
2151
2041
  if max_features is not None or importance_threshold is not None:
2152
2042
  exclude_features_sources = list(
@@ -2248,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
2248
2138
  if add_fit_system_record_id:
2249
2139
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2250
2140
 
2251
- return result, columns_renaming
2141
+ return result
2252
2142
 
2253
2143
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2254
2144
  features_info = self._internal_features_info
@@ -2349,9 +2239,6 @@ class FeaturesEnricher(TransformerMixin):
2349
2239
  self.df_with_original_index = None
2350
2240
  self.__cached_sampled_datasets = None
2351
2241
  self.metrics = None
2352
- self.fit_columns_renaming = None
2353
- self.fit_dropped_features = set()
2354
- self.fit_generated_features = []
2355
2242
 
2356
2243
  validated_X = self._validate_X(X)
2357
2244
  validated_y = self._validate_y(validated_X, y)
@@ -2398,10 +2285,9 @@ class FeaturesEnricher(TransformerMixin):
2398
2285
  self.fit_search_keys = self.search_keys.copy()
2399
2286
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2400
2287
 
2401
- maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2288
+ maybe_date_column = self._get_date_column(self.fit_search_keys)
2402
2289
  has_date = maybe_date_column is not None
2403
2290
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2404
-
2405
2291
  self._validate_binary_observations(validated_y, model_task_type)
2406
2292
 
2407
2293
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2431,13 +2317,7 @@ class FeaturesEnricher(TransformerMixin):
2431
2317
  self.fit_generated_features = []
2432
2318
 
2433
2319
  if has_date:
2434
- converter = DateTimeSearchKeyConverter(
2435
- maybe_date_column,
2436
- self.date_format,
2437
- self.logger,
2438
- bundle=self.bundle,
2439
- warnings_counter=self.warning_counter,
2440
- )
2320
+ converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2441
2321
  df = converter.convert(df, keep_time=True)
2442
2322
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2443
2323
  self.fit_generated_features.extend(converter.generated_features)
@@ -2454,12 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
2454
2334
 
2455
2335
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2456
2336
 
2457
- normalizer = Normalizer(
2458
- self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2459
- )
2460
- df = normalizer.normalize(df)
2461
- columns_renaming = normalizer.columns_renaming
2462
- self.fit_columns_renaming = columns_renaming
2337
+ # TODO normalize and convert all columns
2463
2338
 
2464
2339
  df = remove_fintech_duplicates(
2465
2340
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2467,59 +2342,38 @@ class FeaturesEnricher(TransformerMixin):
2467
2342
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2468
2343
 
2469
2344
  # Explode multiple search keys
2470
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2345
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
+ meaning_types = {
2347
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2348
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
+ }
2350
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
+ if eval_set is not None and len(eval_set) > 0:
2352
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2471
2354
 
2472
2355
  # TODO check that this is correct for enrichment
2473
2356
  self.df_with_original_index = df.copy()
2474
- # TODO check maybe need to drop _time column from df_with_original_index
2475
2357
 
2476
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2358
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2477
2359
 
2478
2360
  # Convert EMAIL to HEM after unnesting to do it only with one column
2479
2361
  email_column = self._get_email_column(self.fit_search_keys)
2480
2362
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
+ email_converted_to_hem = False
2481
2364
  if email_column:
2482
2365
  converter = EmailSearchKeyConverter(
2483
- email_column,
2484
- hem_column,
2485
- self.fit_search_keys,
2486
- columns_renaming,
2487
- list(unnest_search_keys.keys()),
2488
- self.logger,
2366
+ email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2489
2367
  )
2490
2368
  df = converter.convert(df)
2491
2369
  self.fit_generated_features.extend(converter.generated_features)
2492
-
2493
- ip_column = self._get_ip_column(self.fit_search_keys)
2494
- if ip_column:
2495
- converter = IpSearchKeyConverter(
2496
- ip_column,
2497
- self.fit_search_keys,
2498
- columns_renaming,
2499
- list(unnest_search_keys.keys()),
2500
- self.bundle,
2501
- self.logger,
2502
- )
2503
- df = converter.convert(df)
2504
-
2505
- phone_column = self._get_phone_column(self.fit_search_keys)
2506
- country_column = self._get_country_column(self.fit_search_keys)
2507
- if phone_column:
2508
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2509
- df = converter.convert(df)
2510
-
2511
- if country_column:
2512
- converter = CountrySearchKeyConverter(country_column)
2513
- df = converter.convert(df)
2514
-
2515
- postal_code = self._get_postal_column(self.fit_search_keys)
2516
- if postal_code:
2517
- converter = PostalCodeSearchKeyConverter(postal_code)
2518
- df = converter.convert(df)
2370
+ email_converted_to_hem = converter.email_converted_to_hem
2519
2371
 
2520
2372
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2521
2373
  self.fit_search_keys.keys()
2522
2374
  )
2375
+ if email_converted_to_hem:
2376
+ non_feature_columns.append(email_column)
2523
2377
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2524
2378
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2525
2379
 
@@ -2531,6 +2385,9 @@ class FeaturesEnricher(TransformerMixin):
2531
2385
  self.fit_dropped_features.update(features_to_drop)
2532
2386
  df = df.drop(columns=features_to_drop)
2533
2387
 
2388
+ if email_converted_to_hem:
2389
+ self.fit_dropped_features.add(email_column)
2390
+
2534
2391
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2535
2392
 
2536
2393
  meaning_types = {
@@ -2544,12 +2401,7 @@ class FeaturesEnricher(TransformerMixin):
2544
2401
  if eval_set is not None and len(eval_set) > 0:
2545
2402
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2546
2403
 
2547
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2548
-
2549
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2550
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2551
-
2552
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2404
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2553
2405
 
2554
2406
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2555
2407
 
@@ -2567,7 +2419,8 @@ class FeaturesEnricher(TransformerMixin):
2567
2419
  rest_client=self.rest_client,
2568
2420
  logger=self.logger,
2569
2421
  )
2570
- dataset.columns_renaming = columns_renaming
2422
+ if email_converted_to_hem:
2423
+ dataset.ignore_columns = [email_column]
2571
2424
 
2572
2425
  self.passed_features = [
2573
2426
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2956,7 +2809,7 @@ class FeaturesEnricher(TransformerMixin):
2956
2809
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2957
2810
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2958
2811
  else:
2959
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2812
+ date_column = FeaturesEnricher._get_date_column(search_keys)
2960
2813
  sort_columns = [date_column] if date_column is not None else []
2961
2814
 
2962
2815
  # Xy = pd.concat([X, y], axis=1)
@@ -3052,10 +2905,10 @@ class FeaturesEnricher(TransformerMixin):
3052
2905
 
3053
2906
  do_without_pandas_limits(print_datasets_sample)
3054
2907
 
3055
- maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2908
+ maybe_date_col = self._get_date_column(self.search_keys)
3056
2909
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3057
2910
  # TODO cast date column to single dtype
3058
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2911
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3059
2912
  converted_X = date_converter.convert(X)
3060
2913
  min_date = converted_X[maybe_date_col].min()
3061
2914
  max_date = converted_X[maybe_date_col].max()
@@ -3082,6 +2935,12 @@ class FeaturesEnricher(TransformerMixin):
3082
2935
 
3083
2936
  return df
3084
2937
 
2938
+ @staticmethod
2939
+ def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
+ for col, t in search_keys.items():
2941
+ if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
+ return col
2943
+
3085
2944
  @staticmethod
3086
2945
  def _add_current_date_as_key(
3087
2946
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -3097,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
3097
2956
  logger.warning(msg)
3098
2957
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3099
2958
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3100
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2959
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3101
2960
  df = converter.convert(df)
3102
2961
  return df
3103
2962
 
@@ -3125,37 +2984,17 @@ class FeaturesEnricher(TransformerMixin):
3125
2984
  if len(cols) == 1:
3126
2985
  return cols[0]
3127
2986
 
3128
- @staticmethod
3129
- def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3130
- cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3131
- if len(cols) > 1:
3132
- raise Exception("More than one ip column found after unnest")
3133
- if len(cols) == 1:
3134
- return cols[0]
3135
-
3136
2987
  @staticmethod
3137
2988
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3138
2989
  for col, t in search_keys.items():
3139
2990
  if t == SearchKey.PHONE:
3140
2991
  return col
3141
2992
 
3142
- @staticmethod
3143
- def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3144
- for col, t in search_keys.items():
3145
- if t == SearchKey.COUNTRY:
3146
- return col
3147
-
3148
- @staticmethod
3149
- def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3150
- for col, t in search_keys.items():
3151
- if t == SearchKey.POSTAL_CODE:
3152
- return col
3153
-
3154
2993
  def _explode_multiple_search_keys(
3155
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2994
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3156
2995
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
3157
2996
  # find groups of multiple search keys
3158
- search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2997
+ search_key_names_by_type: Dict[SearchKey, str] = {}
3159
2998
  for key_name, key_type in search_keys.items():
3160
2999
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3161
3000
  search_key_names_by_type = {
@@ -3179,7 +3018,6 @@ class FeaturesEnricher(TransformerMixin):
3179
3018
  del search_keys[old_key]
3180
3019
  search_keys[new_search_key] = key_type
3181
3020
  unnest_search_keys[new_search_key] = key_names
3182
- columns_renaming[new_search_key] = new_search_key
3183
3021
 
3184
3022
  df = pd.concat(exploded_dfs, ignore_index=True)
3185
3023
  return df, unnest_search_keys
@@ -3187,7 +3025,7 @@ class FeaturesEnricher(TransformerMixin):
3187
3025
  def __add_fit_system_record_id(
3188
3026
  self,
3189
3027
  df: pd.DataFrame,
3190
- # meaning_types: Dict[str, FileColumnMeaningType],
3028
+ meaning_types: Dict[str, FileColumnMeaningType],
3191
3029
  search_keys: Dict[str, SearchKey],
3192
3030
  id_name: str,
3193
3031
  ) -> pd.DataFrame:
@@ -3210,9 +3048,9 @@ class FeaturesEnricher(TransformerMixin):
3210
3048
  ]
3211
3049
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3212
3050
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3213
- sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3051
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3214
3052
  else:
3215
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3053
+ date_column = self._get_date_column(search_keys)
3216
3054
  sort_columns = [date_column] if date_column is not None else []
3217
3055
 
3218
3056
  other_columns = sorted(
@@ -3221,6 +3059,13 @@ class FeaturesEnricher(TransformerMixin):
3221
3059
  for c in df.columns
3222
3060
  if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3223
3061
  ]
3062
+ # [
3063
+ # sk
3064
+ # for sk, key_type in search_keys.items()
3065
+ # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
+ # and sk in df.columns
3067
+ # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
+ # ]
3224
3069
  )
3225
3070
 
3226
3071
  search_keys_hash = "search_keys_hash"
@@ -3233,6 +3078,9 @@ class FeaturesEnricher(TransformerMixin):
3233
3078
  if search_keys_hash in df.columns:
3234
3079
  df.drop(columns=search_keys_hash, inplace=True)
3235
3080
 
3081
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
+ df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
+
3236
3084
  df = df.reset_index(drop=True).reset_index()
3237
3085
  # system_record_id saves correct order for fit
3238
3086
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3242,11 +3090,11 @@ class FeaturesEnricher(TransformerMixin):
3242
3090
  df.index.name = original_index_name
3243
3091
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3244
3092
 
3245
- # meaning_types[id_name] = (
3246
- # FileColumnMeaningType.SYSTEM_RECORD_ID
3247
- # if id_name == SYSTEM_RECORD_ID
3248
- # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3249
- # )
3093
+ meaning_types[id_name] = (
3094
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3095
+ if id_name == SYSTEM_RECORD_ID
3096
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
+ )
3250
3098
  return df
3251
3099
 
3252
3100
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3624,13 +3472,10 @@ class FeaturesEnricher(TransformerMixin):
3624
3472
  for _, key_type in search_keys.items():
3625
3473
  if not isinstance(key_type, SearchKey):
3626
3474
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3627
-
3628
3475
  valid_search_keys = {}
3629
3476
  unsupported_search_keys = {
3630
3477
  SearchKey.IP_RANGE_FROM,
3631
3478
  SearchKey.IP_RANGE_TO,
3632
- SearchKey.IPV6_RANGE_FROM,
3633
- SearchKey.IPV6_RANGE_TO,
3634
3479
  SearchKey.MSISDN_RANGE_FROM,
3635
3480
  SearchKey.MSISDN_RANGE_TO,
3636
3481
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3720,7 +3565,6 @@ class FeaturesEnricher(TransformerMixin):
3720
3565
  print(msg)
3721
3566
  self.logger.warning(msg)
3722
3567
  self.warning_counter.increment()
3723
- # TODO maybe raise ValidationError
3724
3568
 
3725
3569
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3726
3570