upgini 1.1.315__py3-none-any.whl → 1.1.315a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -61,11 +61,15 @@ from upgini.metadata import (
61
61
  SearchKey,
62
62
  )
63
63
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
64
+ from upgini.normalizer.normalize_utils import Normalizer
64
65
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
65
66
  from upgini.search_task import SearchTask
66
67
  from upgini.spinner import Spinner
67
68
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
68
- from upgini.utils.country_utils import CountrySearchKeyDetector
69
+ from upgini.utils.country_utils import (
70
+ CountrySearchKeyConverter,
71
+ CountrySearchKeyDetector,
72
+ )
69
73
  from upgini.utils.custom_loss_utils import (
70
74
  get_additional_params_custom_loss,
71
75
  get_runtime_params_custom_loss,
@@ -87,11 +91,19 @@ from upgini.utils.display_utils import (
87
91
  prepare_and_show_report,
88
92
  show_request_quote_button,
89
93
  )
90
- from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
94
+ from upgini.utils.email_utils import (
95
+ EmailDomainGenerator,
96
+ EmailSearchKeyConverter,
97
+ EmailSearchKeyDetector,
98
+ )
91
99
  from upgini.utils.features_validator import FeaturesValidator
92
100
  from upgini.utils.format import Format
93
- from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
- from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
101
+ from upgini.utils.ip_utils import IpSearchKeyConverter
102
+ from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
103
+ from upgini.utils.postal_code_utils import (
104
+ PostalCodeSearchKeyConverter,
105
+ PostalCodeSearchKeyDetector,
106
+ )
95
107
 
96
108
  try:
97
109
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -237,6 +249,7 @@ class FeaturesEnricher(TransformerMixin):
237
249
 
238
250
  self.passed_features: List[str] = []
239
251
  self.df_with_original_index: Optional[pd.DataFrame] = None
252
+ self.fit_columns_renaming: Optional[Dict[str, str]] = None
240
253
  self.country_added = False
241
254
  self.fit_generated_features: List[str] = []
242
255
  self.fit_dropped_features: Set[str] = set()
@@ -247,7 +260,7 @@ class FeaturesEnricher(TransformerMixin):
247
260
  self.eval_set: Optional[List[Tuple]] = None
248
261
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
249
262
  self.imbalanced = False
250
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
263
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
251
264
 
252
265
  validate_version(self.logger)
253
266
  self.search_keys = search_keys or {}
@@ -706,7 +719,7 @@ class FeaturesEnricher(TransformerMixin):
706
719
 
707
720
  start_time = time.time()
708
721
  try:
709
- result = self.__inner_transform(
722
+ result, _ = self.__inner_transform(
710
723
  trace_id,
711
724
  X,
712
725
  exclude_features_sources=exclude_features_sources,
@@ -906,8 +919,16 @@ class FeaturesEnricher(TransformerMixin):
906
919
  search_keys,
907
920
  groups,
908
921
  _cv,
922
+ columns_renaming,
909
923
  ) = prepared_data
910
924
 
925
+ # rename cat_features
926
+ if cat_features:
927
+ for new_c, old_c in columns_renaming.items():
928
+ if old_c in cat_features:
929
+ cat_features.remove(old_c)
930
+ cat_features.append(new_c)
931
+
911
932
  gc.collect()
912
933
 
913
934
  print(self.bundle.get("metrics_start"))
@@ -920,7 +941,7 @@ class FeaturesEnricher(TransformerMixin):
920
941
 
921
942
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
922
943
 
923
- has_date = self._get_date_column(search_keys) is not None
944
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
924
945
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
925
946
 
926
947
  wrapper = EstimatorWrapper.create(
@@ -1013,7 +1034,7 @@ class FeaturesEnricher(TransformerMixin):
1013
1034
  self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
1014
1035
  }
1015
1036
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1016
- y_sorted
1037
+ effective_y
1017
1038
  ):
1018
1039
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1019
1040
  np.mean(effective_y), 4
@@ -1086,7 +1107,7 @@ class FeaturesEnricher(TransformerMixin):
1086
1107
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1087
1108
  }
1088
1109
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1089
- eval_y_sorted
1110
+ effective_eval_set[idx][1]
1090
1111
  ):
1091
1112
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1092
1113
  np.mean(effective_eval_set[idx][1]), 4
@@ -1113,7 +1134,7 @@ class FeaturesEnricher(TransformerMixin):
1113
1134
  )
1114
1135
 
1115
1136
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1116
- date_column = self._get_date_column(search_keys)
1137
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1117
1138
  if (
1118
1139
  uplift_col in metrics_df.columns
1119
1140
  and (metrics_df[uplift_col] < 0).any()
@@ -1195,27 +1216,6 @@ class FeaturesEnricher(TransformerMixin):
1195
1216
  def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
1196
1217
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1197
1218
 
1198
- def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
1199
- search_keys = self.search_keys.copy()
1200
- search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1201
-
1202
- extended_X = x.copy()
1203
- generated_features = []
1204
- date_column = self._get_date_column(search_keys)
1205
- if date_column is not None:
1206
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1207
- extended_X = converter.convert(extended_X, keep_time=True)
1208
- generated_features.extend(converter.generated_features)
1209
- email_column = self._get_email_column(search_keys)
1210
- hem_column = self._get_hem_column(search_keys)
1211
- if email_column:
1212
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1213
- extended_X = converter.convert(extended_X)
1214
- generated_features.extend(converter.generated_features)
1215
- generated_features = [f for f in generated_features if f in self.fit_generated_features]
1216
-
1217
- return extended_X, search_keys
1218
-
1219
1219
  def _is_input_same_as_fit(
1220
1220
  self,
1221
1221
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
@@ -1259,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
1259
1259
  groups = None
1260
1260
 
1261
1261
  if not isinstance(_cv, BaseCrossValidator):
1262
- date_column = self._get_date_column(search_keys)
1262
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1263
1263
  date_series = X[date_column] if date_column is not None else None
1264
1264
  _cv, groups = CVConfig(
1265
1265
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1282,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
1282
1282
 
1283
1283
  def _get_client_cat_features(
1284
1284
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1285
- ) -> Optional[List[str]]:
1285
+ ) -> Tuple[Optional[List[str]], List[str]]:
1286
1286
  cat_features = None
1287
1287
  search_keys_for_metrics = []
1288
1288
  if (
@@ -1342,11 +1342,15 @@ class FeaturesEnricher(TransformerMixin):
1342
1342
  progress_bar,
1343
1343
  progress_callback,
1344
1344
  )
1345
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1345
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1346
+ sampled_data
1347
+ )
1346
1348
 
1347
1349
  excluding_search_keys = list(search_keys.keys())
1348
1350
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1349
- excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1351
+ for sk in excluding_search_keys:
1352
+ if columns_renaming.get(sk) in search_keys_for_metrics:
1353
+ excluding_search_keys.remove(sk)
1350
1354
 
1351
1355
  client_features = [
1352
1356
  c
@@ -1363,6 +1367,7 @@ class FeaturesEnricher(TransformerMixin):
1363
1367
  importance_threshold,
1364
1368
  max_features,
1365
1369
  )
1370
+ filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1366
1371
 
1367
1372
  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
1368
1373
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
@@ -1392,6 +1397,7 @@ class FeaturesEnricher(TransformerMixin):
1392
1397
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1393
1398
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1394
1399
 
1400
+ # TODO maybe there is no more need for these conversions
1395
1401
  # Remove datetime features
1396
1402
  datetime_features = [
1397
1403
  f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
@@ -1479,6 +1485,7 @@ class FeaturesEnricher(TransformerMixin):
1479
1485
  search_keys,
1480
1486
  groups,
1481
1487
  cv,
1488
+ columns_renaming,
1482
1489
  )
1483
1490
 
1484
1491
  @dataclass
@@ -1488,6 +1495,7 @@ class FeaturesEnricher(TransformerMixin):
1488
1495
  enriched_X: pd.DataFrame
1489
1496
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1490
1497
  search_keys: Dict[str, SearchKey]
1498
+ columns_renaming: Dict[str, str]
1491
1499
 
1492
1500
  def _sample_data_for_metrics(
1493
1501
  self,
@@ -1527,11 +1535,15 @@ class FeaturesEnricher(TransformerMixin):
1527
1535
  )
1528
1536
 
1529
1537
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1530
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1538
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1539
+ self.__cached_sampled_datasets
1540
+ )
1531
1541
  if exclude_features_sources:
1532
1542
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1533
1543
 
1534
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1544
+ return self.__mk_sampled_data_tuple(
1545
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1546
+ )
1535
1547
 
1536
1548
  def __sample_only_input(
1537
1549
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1549,6 +1561,28 @@ class FeaturesEnricher(TransformerMixin):
1549
1561
  eval_xy[EVAL_SET_INDEX] = idx + 1
1550
1562
  df = pd.concat([df, eval_xy])
1551
1563
 
1564
+ search_keys = self.search_keys.copy()
1565
+ search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1566
+
1567
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1568
+ generated_features = []
1569
+ if date_column is not None:
1570
+ converter = DateTimeSearchKeyConverter(
1571
+ date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1572
+ )
1573
+ df = converter.convert(df, keep_time=True)
1574
+ generated_features = converter.generated_features
1575
+
1576
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
1577
+ if email_columns:
1578
+ generator = EmailDomainGenerator(email_columns)
1579
+ df = generator.generate(df)
1580
+ generated_features.extend(generator.generated_features)
1581
+
1582
+ normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1583
+ df = normalizer.normalize(df)
1584
+ columns_renaming = normalizer.columns_renaming
1585
+
1552
1586
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1553
1587
 
1554
1588
  num_samples = _num_samples(df)
@@ -1561,24 +1595,34 @@ class FeaturesEnricher(TransformerMixin):
1561
1595
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1562
1596
  df = df.sample(n=sample_rows, random_state=self.random_state)
1563
1597
 
1564
- df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
- df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1598
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1599
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1600
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1566
1601
 
1567
- train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1602
+ train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1568
1603
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1569
1604
  y_sampled = train_df[TARGET].copy()
1570
1605
  enriched_X = X_sampled
1571
1606
 
1572
1607
  if eval_set is not None:
1573
1608
  for idx in range(len(eval_set)):
1574
- eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1609
+ eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1575
1610
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1576
1611
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1577
1612
  enriched_eval_X = eval_X_sampled
1578
1613
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1579
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1614
+ self.__cached_sampled_datasets = (
1615
+ X_sampled,
1616
+ y_sampled,
1617
+ enriched_X,
1618
+ eval_set_sampled_dict,
1619
+ search_keys,
1620
+ columns_renaming,
1621
+ )
1580
1622
 
1581
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1623
+ return self.__mk_sampled_data_tuple(
1624
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1625
+ )
1582
1626
 
1583
1627
  def __sample_balanced(
1584
1628
  self,
@@ -1590,7 +1634,7 @@ class FeaturesEnricher(TransformerMixin):
1590
1634
  search_keys = self.fit_search_keys
1591
1635
 
1592
1636
  rows_to_drop = None
1593
- has_date = self._get_date_column(search_keys) is not None
1637
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1594
1638
  task_type = self.model_task_type or define_task(
1595
1639
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1596
1640
  )
@@ -1644,9 +1688,18 @@ class FeaturesEnricher(TransformerMixin):
1644
1688
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1645
1689
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1646
1690
 
1647
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1691
+ self.__cached_sampled_datasets = (
1692
+ X_sampled,
1693
+ y_sampled,
1694
+ enriched_X,
1695
+ eval_set_sampled_dict,
1696
+ search_keys,
1697
+ self.fit_columns_renaming,
1698
+ )
1648
1699
 
1649
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1700
+ return self.__mk_sampled_data_tuple(
1701
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1702
+ )
1650
1703
 
1651
1704
  def __sample_imbalanced(
1652
1705
  self,
@@ -1686,7 +1739,7 @@ class FeaturesEnricher(TransformerMixin):
1686
1739
  tmp_target_name = "__target"
1687
1740
  df = df.rename(columns={TARGET: tmp_target_name})
1688
1741
 
1689
- enriched_df = self.__inner_transform(
1742
+ enriched_df, columns_renaming = self.__inner_transform(
1690
1743
  trace_id,
1691
1744
  df,
1692
1745
  exclude_features_sources=exclude_features_sources,
@@ -1734,7 +1787,7 @@ class FeaturesEnricher(TransformerMixin):
1734
1787
  tmp_target_name = "__target"
1735
1788
  df = df.rename(columns={TARGET: tmp_target_name})
1736
1789
 
1737
- enriched_Xy = self.__inner_transform(
1790
+ enriched_Xy, columns_renaming = self.__inner_transform(
1738
1791
  trace_id,
1739
1792
  df,
1740
1793
  exclude_features_sources=exclude_features_sources,
@@ -1759,9 +1812,18 @@ class FeaturesEnricher(TransformerMixin):
1759
1812
  y_sampled = enriched_Xy[TARGET].copy()
1760
1813
  enriched_X = enriched_Xy.drop(columns=TARGET)
1761
1814
 
1762
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1815
+ self.__cached_sampled_datasets = (
1816
+ X_sampled,
1817
+ y_sampled,
1818
+ enriched_X,
1819
+ eval_set_sampled_dict,
1820
+ self.search_keys,
1821
+ columns_renaming,
1822
+ )
1763
1823
 
1764
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1824
+ return self.__mk_sampled_data_tuple(
1825
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1826
+ )
1765
1827
 
1766
1828
  def __mk_sampled_data_tuple(
1767
1829
  self,
@@ -1770,6 +1832,7 @@ class FeaturesEnricher(TransformerMixin):
1770
1832
  enriched_X: pd.DataFrame,
1771
1833
  eval_set_sampled_dict: Dict,
1772
1834
  search_keys: Dict,
1835
+ columns_renaming: Dict[str, str],
1773
1836
  ):
1774
1837
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1775
1838
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1778,6 +1841,7 @@ class FeaturesEnricher(TransformerMixin):
1778
1841
  enriched_X=enriched_X,
1779
1842
  eval_set_sampled_dict=eval_set_sampled_dict,
1780
1843
  search_keys=search_keys,
1844
+ columns_renaming=columns_renaming,
1781
1845
  )
1782
1846
 
1783
1847
  def get_search_id(self) -> Optional[str]:
@@ -1866,7 +1930,7 @@ class FeaturesEnricher(TransformerMixin):
1866
1930
  progress_bar: Optional[ProgressBar] = None,
1867
1931
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1868
1932
  add_fit_system_record_id: bool = False,
1869
- ) -> pd.DataFrame:
1933
+ ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1870
1934
  if self._search_task is None:
1871
1935
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1872
1936
 
@@ -1879,13 +1943,13 @@ class FeaturesEnricher(TransformerMixin):
1879
1943
 
1880
1944
  if len(self.feature_names_) == 0:
1881
1945
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1882
- return X
1946
+ return X, {c: c for c in X.columns}
1883
1947
 
1884
1948
  if self._has_paid_features(exclude_features_sources):
1885
1949
  msg = self.bundle.get("transform_with_paid_features")
1886
1950
  self.logger.warning(msg)
1887
1951
  self.__display_support_link(msg)
1888
- return None
1952
+ return None, {c: c for c in X.columns}
1889
1953
 
1890
1954
  if not metrics_calculation:
1891
1955
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1896,7 +1960,7 @@ class FeaturesEnricher(TransformerMixin):
1896
1960
  self.logger.warning(msg)
1897
1961
  print(msg)
1898
1962
  show_request_quote_button()
1899
- return None
1963
+ return None, {c: c for c in X.columns}
1900
1964
  else:
1901
1965
  msg = self.bundle.get("transform_usage_info").format(
1902
1966
  transform_usage.limit, transform_usage.transformed_rows
@@ -1934,9 +1998,11 @@ class FeaturesEnricher(TransformerMixin):
1934
1998
  df = self.__add_country_code(df, search_keys)
1935
1999
 
1936
2000
  generated_features = []
1937
- date_column = self._get_date_column(search_keys)
2001
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1938
2002
  if date_column is not None:
1939
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2003
+ converter = DateTimeSearchKeyConverter(
2004
+ date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2005
+ )
1940
2006
  df = converter.convert(df)
1941
2007
  self.logger.info(f"Date column after convertion: {df[date_column]}")
1942
2008
  generated_features.extend(converter.generated_features)
@@ -1945,61 +2011,93 @@ class FeaturesEnricher(TransformerMixin):
1945
2011
  if self.add_date_if_missing:
1946
2012
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1947
2013
 
2014
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2015
+ if email_columns:
2016
+ generator = EmailDomainGenerator(email_columns)
2017
+ df = generator.generate(df)
2018
+ generated_features.extend(generator.generated_features)
2019
+
2020
+ normalizer = Normalizer(
2021
+ search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2022
+ )
2023
+ df = normalizer.normalize(df)
2024
+ columns_renaming = normalizer.columns_renaming
2025
+
1948
2026
  # Don't pass all features in backend on transform
1949
- original_features_for_transform = []
1950
2027
  runtime_parameters = self._get_copy_of_runtime_parameters()
1951
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
- if len(features_not_to_pass) > 0:
1953
- # Pass only features that need for transform
1954
- features_for_transform = self._search_task.get_features_for_transform()
1955
- if features_for_transform is not None and len(features_for_transform) > 0:
1956
- file_metadata = self._search_task.get_file_metadata(trace_id)
1957
- original_features_for_transform = [
1958
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
- ]
1960
-
1961
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2028
+ features_for_transform = self._search_task.get_features_for_transform() or []
2029
+ if len(features_for_transform) > 0:
2030
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1962
2031
 
1963
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2032
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1964
2033
 
1965
2034
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1966
2035
  df[columns_for_system_record_id], index=False
1967
2036
  ).astype("Float64")
1968
2037
 
1969
2038
  # Explode multiple search keys
1970
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2039
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1971
2040
 
1972
2041
  email_column = self._get_email_column(search_keys)
1973
2042
  hem_column = self._get_hem_column(search_keys)
1974
- email_converted_to_hem = False
1975
2043
  if email_column:
1976
2044
  converter = EmailSearchKeyConverter(
1977
- email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2045
+ email_column,
2046
+ hem_column,
2047
+ search_keys,
2048
+ columns_renaming,
2049
+ list(unnest_search_keys.keys()),
2050
+ self.logger,
1978
2051
  )
1979
2052
  df = converter.convert(df)
1980
- generated_features.extend(converter.generated_features)
1981
- email_converted_to_hem = converter.email_converted_to_hem
2053
+
2054
+ ip_column = self._get_ip_column(search_keys)
2055
+ if ip_column:
2056
+ converter = IpSearchKeyConverter(
2057
+ ip_column,
2058
+ search_keys,
2059
+ columns_renaming,
2060
+ list(unnest_search_keys.keys()),
2061
+ self.bundle,
2062
+ self.logger,
2063
+ )
2064
+ df = converter.convert(df)
2065
+
2066
+ phone_column = self._get_phone_column(search_keys)
2067
+ country_column = self._get_country_column(search_keys)
2068
+ if phone_column:
2069
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2070
+ df = converter.convert(df)
2071
+
2072
+ if country_column:
2073
+ converter = CountrySearchKeyConverter(country_column)
2074
+ df = converter.convert(df)
2075
+
2076
+ postal_code = self._get_postal_column(search_keys)
2077
+ if postal_code:
2078
+ converter = PostalCodeSearchKeyConverter(postal_code)
2079
+ df = converter.convert(df)
2080
+
1982
2081
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1983
2082
 
1984
2083
  meaning_types = {col: key.value for col, key in search_keys.items()}
1985
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
- for col in original_features_for_transform:
2084
+ for col in features_for_transform:
1987
2085
  meaning_types[col] = FileColumnMeaningType.FEATURE
1988
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
-
1990
- if email_converted_to_hem:
1991
- features_not_to_pass.append(email_column)
1992
-
1993
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2086
+ features_not_to_pass = [
2087
+ c
2088
+ for c in df.columns
2089
+ if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
2090
+ ]
1995
2091
 
1996
2092
  if add_fit_system_record_id:
1997
- df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2093
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2094
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2095
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1998
2096
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1999
2097
  features_not_to_pass.append(SORT_ID)
2000
2098
 
2001
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
-
2099
+ # search keys might be changed after explode
2100
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2003
2101
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2004
2102
  "Float64"
2005
2103
  )
@@ -2035,8 +2133,7 @@ class FeaturesEnricher(TransformerMixin):
2035
2133
  rest_client=self.rest_client,
2036
2134
  logger=self.logger,
2037
2135
  )
2038
- if email_converted_to_hem:
2039
- dataset.ignore_columns = [email_column]
2136
+ dataset.columns_renaming = columns_renaming
2040
2137
 
2041
2138
  if max_features is not None or importance_threshold is not None:
2042
2139
  exclude_features_sources = list(
@@ -2125,7 +2222,9 @@ class FeaturesEnricher(TransformerMixin):
2125
2222
  result = enrich()
2126
2223
 
2127
2224
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2128
- existing_filtered_columns = [c for c in filtered_columns if c in result.columns]
2225
+ existing_filtered_columns = [
2226
+ c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2227
+ ]
2129
2228
  selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2130
2229
  if add_fit_system_record_id:
2131
2230
  selecting_columns.append(SORT_ID)
@@ -2138,7 +2237,7 @@ class FeaturesEnricher(TransformerMixin):
2138
2237
  if add_fit_system_record_id:
2139
2238
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2140
2239
 
2141
- return result
2240
+ return result, columns_renaming
2142
2241
 
2143
2242
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2144
2243
  features_info = self._internal_features_info
@@ -2239,6 +2338,9 @@ class FeaturesEnricher(TransformerMixin):
2239
2338
  self.df_with_original_index = None
2240
2339
  self.__cached_sampled_datasets = None
2241
2340
  self.metrics = None
2341
+ self.fit_columns_renaming = None
2342
+ self.fit_dropped_features = set()
2343
+ self.fit_generated_features = []
2242
2344
 
2243
2345
  validated_X = self._validate_X(X)
2244
2346
  validated_y = self._validate_y(validated_X, y)
@@ -2285,9 +2387,10 @@ class FeaturesEnricher(TransformerMixin):
2285
2387
  self.fit_search_keys = self.search_keys.copy()
2286
2388
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2287
2389
 
2288
- maybe_date_column = self._get_date_column(self.fit_search_keys)
2390
+ maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2289
2391
  has_date = maybe_date_column is not None
2290
2392
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2393
+
2291
2394
  self._validate_binary_observations(validated_y, model_task_type)
2292
2395
 
2293
2396
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2317,7 +2420,13 @@ class FeaturesEnricher(TransformerMixin):
2317
2420
  self.fit_generated_features = []
2318
2421
 
2319
2422
  if has_date:
2320
- converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2423
+ converter = DateTimeSearchKeyConverter(
2424
+ maybe_date_column,
2425
+ self.date_format,
2426
+ self.logger,
2427
+ bundle=self.bundle,
2428
+ warnings_counter=self.warning_counter,
2429
+ )
2321
2430
  df = converter.convert(df, keep_time=True)
2322
2431
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2323
2432
  self.fit_generated_features.extend(converter.generated_features)
@@ -2326,6 +2435,12 @@ class FeaturesEnricher(TransformerMixin):
2326
2435
  if self.add_date_if_missing:
2327
2436
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2328
2437
 
2438
+ email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2439
+ if email_columns:
2440
+ generator = EmailDomainGenerator(email_columns)
2441
+ df = generator.generate(df)
2442
+ self.fit_generated_features.extend(generator.generated_features)
2443
+
2329
2444
  # Checks that need validated date
2330
2445
  validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2331
2446
 
@@ -2334,7 +2449,12 @@ class FeaturesEnricher(TransformerMixin):
2334
2449
 
2335
2450
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2336
2451
 
2337
- # TODO normalize and convert all columns
2452
+ normalizer = Normalizer(
2453
+ self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2454
+ )
2455
+ df = normalizer.normalize(df)
2456
+ columns_renaming = normalizer.columns_renaming
2457
+ self.fit_columns_renaming = columns_renaming
2338
2458
 
2339
2459
  df = remove_fintech_duplicates(
2340
2460
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2342,38 +2462,58 @@ class FeaturesEnricher(TransformerMixin):
2342
2462
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2343
2463
 
2344
2464
  # Explode multiple search keys
2345
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
- meaning_types = {
2347
- **{col: key.value for col, key in self.fit_search_keys.items()},
2348
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
- }
2350
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
- if eval_set is not None and len(eval_set) > 0:
2352
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2465
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2354
2466
 
2355
2467
  # TODO check that this is correct for enrichment
2356
2468
  self.df_with_original_index = df.copy()
2469
+ # TODO check maybe need to drop _time column from df_with_original_index
2357
2470
 
2358
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2471
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2359
2472
 
2360
2473
  # Convert EMAIL to HEM after unnesting to do it only with one column
2361
2474
  email_column = self._get_email_column(self.fit_search_keys)
2362
2475
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
- email_converted_to_hem = False
2364
2476
  if email_column:
2365
2477
  converter = EmailSearchKeyConverter(
2366
- email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2478
+ email_column,
2479
+ hem_column,
2480
+ self.fit_search_keys,
2481
+ columns_renaming,
2482
+ list(unnest_search_keys.keys()),
2483
+ self.logger,
2367
2484
  )
2368
2485
  df = converter.convert(df)
2369
- self.fit_generated_features.extend(converter.generated_features)
2370
- email_converted_to_hem = converter.email_converted_to_hem
2486
+
2487
+ ip_column = self._get_ip_column(self.fit_search_keys)
2488
+ if ip_column:
2489
+ converter = IpSearchKeyConverter(
2490
+ ip_column,
2491
+ self.fit_search_keys,
2492
+ columns_renaming,
2493
+ list(unnest_search_keys.keys()),
2494
+ self.bundle,
2495
+ self.logger,
2496
+ )
2497
+ df = converter.convert(df)
2498
+
2499
+ phone_column = self._get_phone_column(self.fit_search_keys)
2500
+ country_column = self._get_country_column(self.fit_search_keys)
2501
+ if phone_column:
2502
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2503
+ df = converter.convert(df)
2504
+
2505
+ if country_column:
2506
+ converter = CountrySearchKeyConverter(country_column)
2507
+ df = converter.convert(df)
2508
+
2509
+ postal_code = self._get_postal_column(self.fit_search_keys)
2510
+ if postal_code:
2511
+ converter = PostalCodeSearchKeyConverter(postal_code)
2512
+ df = converter.convert(df)
2371
2513
 
2372
2514
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2373
2515
  self.fit_search_keys.keys()
2374
2516
  )
2375
- if email_converted_to_hem:
2376
- non_feature_columns.append(email_column)
2377
2517
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2378
2518
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2379
2519
 
@@ -2385,9 +2525,6 @@ class FeaturesEnricher(TransformerMixin):
2385
2525
  self.fit_dropped_features.update(features_to_drop)
2386
2526
  df = df.drop(columns=features_to_drop)
2387
2527
 
2388
- if email_converted_to_hem:
2389
- self.fit_dropped_features.add(email_column)
2390
-
2391
2528
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2392
2529
 
2393
2530
  meaning_types = {
@@ -2401,7 +2538,12 @@ class FeaturesEnricher(TransformerMixin):
2401
2538
  if eval_set is not None and len(eval_set) > 0:
2402
2539
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2403
2540
 
2404
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2541
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2542
+
2543
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2544
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2545
+
2546
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2405
2547
 
2406
2548
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2407
2549
 
@@ -2419,8 +2561,7 @@ class FeaturesEnricher(TransformerMixin):
2419
2561
  rest_client=self.rest_client,
2420
2562
  logger=self.logger,
2421
2563
  )
2422
- if email_converted_to_hem:
2423
- dataset.ignore_columns = [email_column]
2564
+ dataset.columns_renaming = columns_renaming
2424
2565
 
2425
2566
  self.passed_features = [
2426
2567
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2809,7 +2950,7 @@ class FeaturesEnricher(TransformerMixin):
2809
2950
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2810
2951
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2811
2952
  else:
2812
- date_column = FeaturesEnricher._get_date_column(search_keys)
2953
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2813
2954
  sort_columns = [date_column] if date_column is not None else []
2814
2955
 
2815
2956
  # Xy = pd.concat([X, y], axis=1)
@@ -2905,10 +3046,10 @@ class FeaturesEnricher(TransformerMixin):
2905
3046
 
2906
3047
  do_without_pandas_limits(print_datasets_sample)
2907
3048
 
2908
- maybe_date_col = self._get_date_column(self.search_keys)
3049
+ maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2909
3050
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2910
3051
  # TODO cast date column to single dtype
2911
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3052
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2912
3053
  converted_X = date_converter.convert(X)
2913
3054
  min_date = converted_X[maybe_date_col].min()
2914
3055
  max_date = converted_X[maybe_date_col].max()
@@ -2935,12 +3076,6 @@ class FeaturesEnricher(TransformerMixin):
2935
3076
 
2936
3077
  return df
2937
3078
 
2938
- @staticmethod
2939
- def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
- for col, t in search_keys.items():
2941
- if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
- return col
2943
-
2944
3079
  @staticmethod
2945
3080
  def _add_current_date_as_key(
2946
3081
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -2956,7 +3091,7 @@ class FeaturesEnricher(TransformerMixin):
2956
3091
  logger.warning(msg)
2957
3092
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2958
3093
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2959
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3094
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2960
3095
  df = converter.convert(df)
2961
3096
  return df
2962
3097
 
@@ -2984,17 +3119,37 @@ class FeaturesEnricher(TransformerMixin):
2984
3119
  if len(cols) == 1:
2985
3120
  return cols[0]
2986
3121
 
3122
+ @staticmethod
3123
+ def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3124
+ cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3125
+ if len(cols) > 1:
3126
+ raise Exception("More than one ip column found after unnest")
3127
+ if len(cols) == 1:
3128
+ return cols[0]
3129
+
2987
3130
  @staticmethod
2988
3131
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2989
3132
  for col, t in search_keys.items():
2990
3133
  if t == SearchKey.PHONE:
2991
3134
  return col
2992
3135
 
3136
+ @staticmethod
3137
+ def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3138
+ for col, t in search_keys.items():
3139
+ if t == SearchKey.COUNTRY:
3140
+ return col
3141
+
3142
+ @staticmethod
3143
+ def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3144
+ for col, t in search_keys.items():
3145
+ if t == SearchKey.POSTAL_CODE:
3146
+ return col
3147
+
2993
3148
  def _explode_multiple_search_keys(
2994
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3149
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2995
3150
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
2996
3151
  # find groups of multiple search keys
2997
- search_key_names_by_type: Dict[SearchKey, str] = {}
3152
+ search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2998
3153
  for key_name, key_type in search_keys.items():
2999
3154
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3000
3155
  search_key_names_by_type = {
@@ -3018,6 +3173,7 @@ class FeaturesEnricher(TransformerMixin):
3018
3173
  del search_keys[old_key]
3019
3174
  search_keys[new_search_key] = key_type
3020
3175
  unnest_search_keys[new_search_key] = key_names
3176
+ columns_renaming[new_search_key] = new_search_key
3021
3177
 
3022
3178
  df = pd.concat(exploded_dfs, ignore_index=True)
3023
3179
  return df, unnest_search_keys
@@ -3025,7 +3181,7 @@ class FeaturesEnricher(TransformerMixin):
3025
3181
  def __add_fit_system_record_id(
3026
3182
  self,
3027
3183
  df: pd.DataFrame,
3028
- meaning_types: Dict[str, FileColumnMeaningType],
3184
+ # meaning_types: Dict[str, FileColumnMeaningType],
3029
3185
  search_keys: Dict[str, SearchKey],
3030
3186
  id_name: str,
3031
3187
  ) -> pd.DataFrame:
@@ -3048,9 +3204,9 @@ class FeaturesEnricher(TransformerMixin):
3048
3204
  ]
3049
3205
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3050
3206
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3051
- sort_exclude_columns.append(self._get_date_column(search_keys))
3207
+ sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3052
3208
  else:
3053
- date_column = self._get_date_column(search_keys)
3209
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3054
3210
  sort_columns = [date_column] if date_column is not None else []
3055
3211
 
3056
3212
  other_columns = sorted(
@@ -3059,13 +3215,6 @@ class FeaturesEnricher(TransformerMixin):
3059
3215
  for c in df.columns
3060
3216
  if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3061
3217
  ]
3062
- # [
3063
- # sk
3064
- # for sk, key_type in search_keys.items()
3065
- # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
- # and sk in df.columns
3067
- # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
- # ]
3069
3218
  )
3070
3219
 
3071
3220
  search_keys_hash = "search_keys_hash"
@@ -3078,9 +3227,6 @@ class FeaturesEnricher(TransformerMixin):
3078
3227
  if search_keys_hash in df.columns:
3079
3228
  df.drop(columns=search_keys_hash, inplace=True)
3080
3229
 
3081
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
- df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
-
3084
3230
  df = df.reset_index(drop=True).reset_index()
3085
3231
  # system_record_id saves correct order for fit
3086
3232
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3090,11 +3236,11 @@ class FeaturesEnricher(TransformerMixin):
3090
3236
  df.index.name = original_index_name
3091
3237
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3092
3238
 
3093
- meaning_types[id_name] = (
3094
- FileColumnMeaningType.SYSTEM_RECORD_ID
3095
- if id_name == SYSTEM_RECORD_ID
3096
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
- )
3239
+ # meaning_types[id_name] = (
3240
+ # FileColumnMeaningType.SYSTEM_RECORD_ID
3241
+ # if id_name == SYSTEM_RECORD_ID
3242
+ # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3243
+ # )
3098
3244
  return df
3099
3245
 
3100
3246
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3472,10 +3618,13 @@ class FeaturesEnricher(TransformerMixin):
3472
3618
  for _, key_type in search_keys.items():
3473
3619
  if not isinstance(key_type, SearchKey):
3474
3620
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3621
+
3475
3622
  valid_search_keys = {}
3476
3623
  unsupported_search_keys = {
3477
3624
  SearchKey.IP_RANGE_FROM,
3478
3625
  SearchKey.IP_RANGE_TO,
3626
+ SearchKey.IPV6_RANGE_FROM,
3627
+ SearchKey.IPV6_RANGE_TO,
3479
3628
  SearchKey.MSISDN_RANGE_FROM,
3480
3629
  SearchKey.MSISDN_RANGE_TO,
3481
3630
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3565,6 +3714,7 @@ class FeaturesEnricher(TransformerMixin):
3565
3714
  print(msg)
3566
3715
  self.logger.warning(msg)
3567
3716
  self.warning_counter.increment()
3717
+ # TODO maybe raise ValidationError
3568
3718
 
3569
3719
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3570
3720