upgini 1.1.312a5__py3-none-any.whl → 1.1.313__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as a potentially problematic release.


This version of upgini might be problematic; see the package registry's advisory page for more details.

@@ -61,15 +61,11 @@ from upgini.metadata import (
61
61
  SearchKey,
62
62
  )
63
63
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
64
- from upgini.normalizer.normalize_utils import Normalizer
65
64
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
66
65
  from upgini.search_task import SearchTask
67
66
  from upgini.spinner import Spinner
68
67
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
69
- from upgini.utils.country_utils import (
70
- CountrySearchKeyConverter,
71
- CountrySearchKeyDetector,
72
- )
68
+ from upgini.utils.country_utils import CountrySearchKeyDetector
73
69
  from upgini.utils.custom_loss_utils import (
74
70
  get_additional_params_custom_loss,
75
71
  get_runtime_params_custom_loss,
@@ -91,19 +87,11 @@ from upgini.utils.display_utils import (
91
87
  prepare_and_show_report,
92
88
  show_request_quote_button,
93
89
  )
94
- from upgini.utils.email_utils import (
95
- EmailDomainGenerator,
96
- EmailSearchKeyConverter,
97
- EmailSearchKeyDetector,
98
- )
90
+ from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
99
91
  from upgini.utils.features_validator import FeaturesValidator
100
92
  from upgini.utils.format import Format
101
- from upgini.utils.ip_utils import IpSearchKeyConverter
102
- from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
103
- from upgini.utils.postal_code_utils import (
104
- PostalCodeSearchKeyConverter,
105
- PostalCodeSearchKeyDetector,
106
- )
93
+ from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
+ from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
107
95
 
108
96
  try:
109
97
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -249,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
249
237
 
250
238
  self.passed_features: List[str] = []
251
239
  self.df_with_original_index: Optional[pd.DataFrame] = None
252
- self.fit_columns_renaming: Optional[Dict[str, str]] = None
253
240
  self.country_added = False
254
241
  self.fit_generated_features: List[str] = []
255
242
  self.fit_dropped_features: Set[str] = set()
@@ -260,7 +247,7 @@ class FeaturesEnricher(TransformerMixin):
260
247
  self.eval_set: Optional[List[Tuple]] = None
261
248
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
262
249
  self.imbalanced = False
263
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
250
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
264
251
 
265
252
  validate_version(self.logger)
266
253
  self.search_keys = search_keys or {}
@@ -719,7 +706,7 @@ class FeaturesEnricher(TransformerMixin):
719
706
 
720
707
  start_time = time.time()
721
708
  try:
722
- result, _ = self.__inner_transform(
709
+ result = self.__inner_transform(
723
710
  trace_id,
724
711
  X,
725
712
  exclude_features_sources=exclude_features_sources,
@@ -919,16 +906,8 @@ class FeaturesEnricher(TransformerMixin):
919
906
  search_keys,
920
907
  groups,
921
908
  _cv,
922
- columns_renaming,
923
909
  ) = prepared_data
924
910
 
925
- # rename cat_features
926
- if cat_features:
927
- for new_c, old_c in columns_renaming.items():
928
- if old_c in cat_features:
929
- cat_features.remove(old_c)
930
- cat_features.append(new_c)
931
-
932
911
  gc.collect()
933
912
 
934
913
  print(self.bundle.get("metrics_start"))
@@ -941,7 +920,7 @@ class FeaturesEnricher(TransformerMixin):
941
920
 
942
921
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
943
922
 
944
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
923
+ has_date = self._get_date_column(search_keys) is not None
945
924
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
946
925
 
947
926
  wrapper = EstimatorWrapper.create(
@@ -1034,7 +1013,7 @@ class FeaturesEnricher(TransformerMixin):
1034
1013
  self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
1035
1014
  }
1036
1015
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1037
- effective_y
1016
+ y_sorted
1038
1017
  ):
1039
1018
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1040
1019
  np.mean(effective_y), 4
@@ -1107,7 +1086,7 @@ class FeaturesEnricher(TransformerMixin):
1107
1086
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1108
1087
  }
1109
1088
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1110
- effective_eval_set[idx][1]
1089
+ eval_y_sorted
1111
1090
  ):
1112
1091
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1113
1092
  np.mean(effective_eval_set[idx][1]), 4
@@ -1134,7 +1113,7 @@ class FeaturesEnricher(TransformerMixin):
1134
1113
  )
1135
1114
 
1136
1115
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1137
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1116
+ date_column = self._get_date_column(search_keys)
1138
1117
  if (
1139
1118
  uplift_col in metrics_df.columns
1140
1119
  and (metrics_df[uplift_col] < 0).any()
@@ -1216,6 +1195,27 @@ class FeaturesEnricher(TransformerMixin):
1216
1195
  def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
1217
1196
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1218
1197
 
1198
+ def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
1199
+ search_keys = self.search_keys.copy()
1200
+ search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1201
+
1202
+ extended_X = x.copy()
1203
+ generated_features = []
1204
+ date_column = self._get_date_column(search_keys)
1205
+ if date_column is not None:
1206
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1207
+ extended_X = converter.convert(extended_X, keep_time=True)
1208
+ generated_features.extend(converter.generated_features)
1209
+ email_column = self._get_email_column(search_keys)
1210
+ hem_column = self._get_hem_column(search_keys)
1211
+ if email_column:
1212
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1213
+ extended_X = converter.convert(extended_X)
1214
+ generated_features.extend(converter.generated_features)
1215
+ generated_features = [f for f in generated_features if f in self.fit_generated_features]
1216
+
1217
+ return extended_X, search_keys
1218
+
1219
1219
  def _is_input_same_as_fit(
1220
1220
  self,
1221
1221
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
@@ -1259,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
1259
1259
  groups = None
1260
1260
 
1261
1261
  if not isinstance(_cv, BaseCrossValidator):
1262
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1262
+ date_column = self._get_date_column(search_keys)
1263
1263
  date_series = X[date_column] if date_column is not None else None
1264
1264
  _cv, groups = CVConfig(
1265
1265
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1282,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
1282
1282
 
1283
1283
  def _get_client_cat_features(
1284
1284
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1285
- ) -> Tuple[Optional[List[str]], List[str]]:
1285
+ ) -> Optional[List[str]]:
1286
1286
  cat_features = None
1287
1287
  search_keys_for_metrics = []
1288
1288
  if (
@@ -1342,15 +1342,11 @@ class FeaturesEnricher(TransformerMixin):
1342
1342
  progress_bar,
1343
1343
  progress_callback,
1344
1344
  )
1345
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1346
- sampled_data
1347
- )
1345
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1348
1346
 
1349
1347
  excluding_search_keys = list(search_keys.keys())
1350
1348
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1351
- for sk in excluding_search_keys:
1352
- if columns_renaming.get(sk) in search_keys_for_metrics:
1353
- excluding_search_keys.remove(sk)
1349
+ excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1354
1350
 
1355
1351
  client_features = [
1356
1352
  c
@@ -1367,7 +1363,6 @@ class FeaturesEnricher(TransformerMixin):
1367
1363
  importance_threshold,
1368
1364
  max_features,
1369
1365
  )
1370
- filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1371
1366
 
1372
1367
  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
1373
1368
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
@@ -1397,7 +1392,6 @@ class FeaturesEnricher(TransformerMixin):
1397
1392
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1398
1393
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1399
1394
 
1400
- # TODO maybe there is no more need for these convertions
1401
1395
  # Remove datetime features
1402
1396
  datetime_features = [
1403
1397
  f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
@@ -1485,7 +1479,6 @@ class FeaturesEnricher(TransformerMixin):
1485
1479
  search_keys,
1486
1480
  groups,
1487
1481
  cv,
1488
- columns_renaming,
1489
1482
  )
1490
1483
 
1491
1484
  @dataclass
@@ -1495,7 +1488,6 @@ class FeaturesEnricher(TransformerMixin):
1495
1488
  enriched_X: pd.DataFrame
1496
1489
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1497
1490
  search_keys: Dict[str, SearchKey]
1498
- columns_renaming: Dict[str, str]
1499
1491
 
1500
1492
  def _sample_data_for_metrics(
1501
1493
  self,
@@ -1535,15 +1527,11 @@ class FeaturesEnricher(TransformerMixin):
1535
1527
  )
1536
1528
 
1537
1529
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1538
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1539
- self.__cached_sampled_datasets
1540
- )
1530
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1541
1531
  if exclude_features_sources:
1542
1532
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1543
1533
 
1544
- return self.__mk_sampled_data_tuple(
1545
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1546
- )
1534
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1547
1535
 
1548
1536
  def __sample_only_input(
1549
1537
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1561,28 +1549,6 @@ class FeaturesEnricher(TransformerMixin):
1561
1549
  eval_xy[EVAL_SET_INDEX] = idx + 1
1562
1550
  df = pd.concat([df, eval_xy])
1563
1551
 
1564
- search_keys = self.search_keys.copy()
1565
- search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1566
-
1567
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1568
- generated_features = []
1569
- if date_column is not None:
1570
- converter = DateTimeSearchKeyConverter(
1571
- date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1572
- )
1573
- df = converter.convert(df, keep_time=True)
1574
- generated_features = converter.generated_features
1575
-
1576
- email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
1577
- if email_columns:
1578
- generator = EmailDomainGenerator(email_columns)
1579
- df = generator.generate(df)
1580
- generated_features.extend(generator.generated_features)
1581
-
1582
- normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1583
- df = normalizer.normalize(df)
1584
- columns_renaming = normalizer.columns_renaming
1585
-
1586
1552
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1587
1553
 
1588
1554
  num_samples = _num_samples(df)
@@ -1595,34 +1561,24 @@ class FeaturesEnricher(TransformerMixin):
1595
1561
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1596
1562
  df = df.sample(n=sample_rows, random_state=self.random_state)
1597
1563
 
1598
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1599
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1600
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1564
+ df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
+ df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1601
1566
 
1602
- train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1567
+ train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1603
1568
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1604
1569
  y_sampled = train_df[TARGET].copy()
1605
1570
  enriched_X = X_sampled
1606
1571
 
1607
1572
  if eval_set is not None:
1608
1573
  for idx in range(len(eval_set)):
1609
- eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1574
+ eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1610
1575
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1611
1576
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1612
1577
  enriched_eval_X = eval_X_sampled
1613
1578
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1614
- self.__cached_sampled_datasets = (
1615
- X_sampled,
1616
- y_sampled,
1617
- enriched_X,
1618
- eval_set_sampled_dict,
1619
- search_keys,
1620
- columns_renaming,
1621
- )
1579
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1622
1580
 
1623
- return self.__mk_sampled_data_tuple(
1624
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1625
- )
1581
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1626
1582
 
1627
1583
  def __sample_balanced(
1628
1584
  self,
@@ -1634,7 +1590,7 @@ class FeaturesEnricher(TransformerMixin):
1634
1590
  search_keys = self.fit_search_keys
1635
1591
 
1636
1592
  rows_to_drop = None
1637
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1593
+ has_date = self._get_date_column(search_keys) is not None
1638
1594
  task_type = self.model_task_type or define_task(
1639
1595
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1640
1596
  )
@@ -1688,18 +1644,9 @@ class FeaturesEnricher(TransformerMixin):
1688
1644
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1689
1645
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1690
1646
 
1691
- self.__cached_sampled_datasets = (
1692
- X_sampled,
1693
- y_sampled,
1694
- enriched_X,
1695
- eval_set_sampled_dict,
1696
- search_keys,
1697
- self.fit_columns_renaming,
1698
- )
1647
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1699
1648
 
1700
- return self.__mk_sampled_data_tuple(
1701
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1702
- )
1649
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1703
1650
 
1704
1651
  def __sample_imbalanced(
1705
1652
  self,
@@ -1739,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
1739
1686
  tmp_target_name = "__target"
1740
1687
  df = df.rename(columns={TARGET: tmp_target_name})
1741
1688
 
1742
- enriched_df, columns_renaming = self.__inner_transform(
1689
+ enriched_df = self.__inner_transform(
1743
1690
  trace_id,
1744
1691
  df,
1745
1692
  exclude_features_sources=exclude_features_sources,
@@ -1787,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
1787
1734
  tmp_target_name = "__target"
1788
1735
  df = df.rename(columns={TARGET: tmp_target_name})
1789
1736
 
1790
- enriched_Xy, columns_renaming = self.__inner_transform(
1737
+ enriched_Xy = self.__inner_transform(
1791
1738
  trace_id,
1792
1739
  df,
1793
1740
  exclude_features_sources=exclude_features_sources,
@@ -1812,18 +1759,9 @@ class FeaturesEnricher(TransformerMixin):
1812
1759
  y_sampled = enriched_Xy[TARGET].copy()
1813
1760
  enriched_X = enriched_Xy.drop(columns=TARGET)
1814
1761
 
1815
- self.__cached_sampled_datasets = (
1816
- X_sampled,
1817
- y_sampled,
1818
- enriched_X,
1819
- eval_set_sampled_dict,
1820
- self.search_keys,
1821
- columns_renaming,
1822
- )
1762
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1823
1763
 
1824
- return self.__mk_sampled_data_tuple(
1825
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1826
- )
1764
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1827
1765
 
1828
1766
  def __mk_sampled_data_tuple(
1829
1767
  self,
@@ -1832,7 +1770,6 @@ class FeaturesEnricher(TransformerMixin):
1832
1770
  enriched_X: pd.DataFrame,
1833
1771
  eval_set_sampled_dict: Dict,
1834
1772
  search_keys: Dict,
1835
- columns_renaming: Dict[str, str],
1836
1773
  ):
1837
1774
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1838
1775
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1841,7 +1778,6 @@ class FeaturesEnricher(TransformerMixin):
1841
1778
  enriched_X=enriched_X,
1842
1779
  eval_set_sampled_dict=eval_set_sampled_dict,
1843
1780
  search_keys=search_keys,
1844
- columns_renaming=columns_renaming,
1845
1781
  )
1846
1782
 
1847
1783
  def get_search_id(self) -> Optional[str]:
@@ -1930,7 +1866,7 @@ class FeaturesEnricher(TransformerMixin):
1930
1866
  progress_bar: Optional[ProgressBar] = None,
1931
1867
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1932
1868
  add_fit_system_record_id: bool = False,
1933
- ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1869
+ ) -> pd.DataFrame:
1934
1870
  if self._search_task is None:
1935
1871
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1936
1872
 
@@ -1943,13 +1879,13 @@ class FeaturesEnricher(TransformerMixin):
1943
1879
 
1944
1880
  if len(self.feature_names_) == 0:
1945
1881
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1946
- return X, {c: c for c in X.columns}
1882
+ return X
1947
1883
 
1948
1884
  if self._has_paid_features(exclude_features_sources):
1949
1885
  msg = self.bundle.get("transform_with_paid_features")
1950
1886
  self.logger.warning(msg)
1951
1887
  self.__display_support_link(msg)
1952
- return None, {c: c for c in X.columns}
1888
+ return None
1953
1889
 
1954
1890
  if not metrics_calculation:
1955
1891
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1960,7 +1896,7 @@ class FeaturesEnricher(TransformerMixin):
1960
1896
  self.logger.warning(msg)
1961
1897
  print(msg)
1962
1898
  show_request_quote_button()
1963
- return None, {c: c for c in X.columns}
1899
+ return None
1964
1900
  else:
1965
1901
  msg = self.bundle.get("transform_usage_info").format(
1966
1902
  transform_usage.limit, transform_usage.transformed_rows
@@ -1998,11 +1934,9 @@ class FeaturesEnricher(TransformerMixin):
1998
1934
  df = self.__add_country_code(df, search_keys)
1999
1935
 
2000
1936
  generated_features = []
2001
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1937
+ date_column = self._get_date_column(search_keys)
2002
1938
  if date_column is not None:
2003
- converter = DateTimeSearchKeyConverter(
2004
- date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2005
- )
1939
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2006
1940
  df = converter.convert(df)
2007
1941
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2008
1942
  generated_features.extend(converter.generated_features)
@@ -2011,93 +1945,61 @@ class FeaturesEnricher(TransformerMixin):
2011
1945
  if self.add_date_if_missing:
2012
1946
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2013
1947
 
2014
- email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2015
- if email_columns:
2016
- generator = EmailDomainGenerator(email_columns)
2017
- df = generator.generate(df)
2018
- generated_features.extend(generator.generated_features)
2019
-
2020
- normalizer = Normalizer(
2021
- search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2022
- )
2023
- df = normalizer.normalize(df)
2024
- columns_renaming = normalizer.columns_renaming
2025
-
2026
1948
  # Don't pass all features in backend on transform
1949
+ original_features_for_transform = []
2027
1950
  runtime_parameters = self._get_copy_of_runtime_parameters()
2028
- features_for_transform = self._search_task.get_features_for_transform() or []
2029
- if len(features_for_transform) > 0:
2030
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1951
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
+ if len(features_not_to_pass) > 0:
1953
+ # Pass only features that need for transform
1954
+ features_for_transform = self._search_task.get_features_for_transform()
1955
+ if features_for_transform is not None and len(features_for_transform) > 0:
1956
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1957
+ original_features_for_transform = [
1958
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
+ ]
1960
+
1961
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2031
1962
 
2032
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1963
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2033
1964
 
2034
1965
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
2035
1966
  df[columns_for_system_record_id], index=False
2036
1967
  ).astype("Float64")
2037
1968
 
2038
1969
  # Explode multiple search keys
2039
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1970
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2040
1971
 
2041
1972
  email_column = self._get_email_column(search_keys)
2042
1973
  hem_column = self._get_hem_column(search_keys)
1974
+ email_converted_to_hem = False
2043
1975
  if email_column:
2044
1976
  converter = EmailSearchKeyConverter(
2045
- email_column,
2046
- hem_column,
2047
- search_keys,
2048
- columns_renaming,
2049
- list(unnest_search_keys.keys()),
2050
- self.logger,
1977
+ email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2051
1978
  )
2052
1979
  df = converter.convert(df)
2053
-
2054
- ip_column = self._get_ip_column(search_keys)
2055
- if ip_column:
2056
- converter = IpSearchKeyConverter(
2057
- ip_column,
2058
- search_keys,
2059
- columns_renaming,
2060
- list(unnest_search_keys.keys()),
2061
- self.bundle,
2062
- self.logger,
2063
- )
2064
- df = converter.convert(df)
2065
-
2066
- phone_column = self._get_phone_column(search_keys)
2067
- country_column = self._get_country_column(search_keys)
2068
- if phone_column:
2069
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2070
- df = converter.convert(df)
2071
-
2072
- if country_column:
2073
- converter = CountrySearchKeyConverter(country_column)
2074
- df = converter.convert(df)
2075
-
2076
- postal_code = self._get_postal_column(search_keys)
2077
- if postal_code:
2078
- converter = PostalCodeSearchKeyConverter(postal_code)
2079
- df = converter.convert(df)
2080
-
1980
+ generated_features.extend(converter.generated_features)
1981
+ email_converted_to_hem = converter.email_converted_to_hem
2081
1982
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
2082
1983
 
2083
1984
  meaning_types = {col: key.value for col, key in search_keys.items()}
2084
- for col in features_for_transform:
1985
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
+ for col in original_features_for_transform:
2085
1987
  meaning_types[col] = FileColumnMeaningType.FEATURE
2086
- features_not_to_pass = [
2087
- c
2088
- for c in df.columns
2089
- if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
2090
- ]
1988
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
+
1990
+ if email_converted_to_hem:
1991
+ features_not_to_pass.append(email_column)
1992
+
1993
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2091
1995
 
2092
1996
  if add_fit_system_record_id:
2093
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2094
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2095
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1997
+ df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2096
1998
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2097
1999
  features_not_to_pass.append(SORT_ID)
2098
2000
 
2099
- # search keys might be changed after explode
2100
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2001
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
+
2101
2003
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2102
2004
  "Float64"
2103
2005
  )
@@ -2133,7 +2035,8 @@ class FeaturesEnricher(TransformerMixin):
2133
2035
  rest_client=self.rest_client,
2134
2036
  logger=self.logger,
2135
2037
  )
2136
- dataset.columns_renaming = columns_renaming
2038
+ if email_converted_to_hem:
2039
+ dataset.ignore_columns = [email_column]
2137
2040
 
2138
2041
  if max_features is not None or importance_threshold is not None:
2139
2042
  exclude_features_sources = list(
@@ -2222,9 +2125,7 @@ class FeaturesEnricher(TransformerMixin):
2222
2125
  result = enrich()
2223
2126
 
2224
2127
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2225
- existing_filtered_columns = [
2226
- c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2227
- ]
2128
+ existing_filtered_columns = [c for c in filtered_columns if c in result.columns]
2228
2129
  selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2229
2130
  if add_fit_system_record_id:
2230
2131
  selecting_columns.append(SORT_ID)
@@ -2237,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
2237
2138
  if add_fit_system_record_id:
2238
2139
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2239
2140
 
2240
- return result, columns_renaming
2141
+ return result
2241
2142
 
2242
2143
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2243
2144
  features_info = self._internal_features_info
@@ -2338,9 +2239,6 @@ class FeaturesEnricher(TransformerMixin):
2338
2239
  self.df_with_original_index = None
2339
2240
  self.__cached_sampled_datasets = None
2340
2241
  self.metrics = None
2341
- self.fit_columns_renaming = None
2342
- self.fit_dropped_features = set()
2343
- self.fit_generated_features = []
2344
2242
 
2345
2243
  validated_X = self._validate_X(X)
2346
2244
  validated_y = self._validate_y(validated_X, y)
@@ -2387,10 +2285,9 @@ class FeaturesEnricher(TransformerMixin):
2387
2285
  self.fit_search_keys = self.search_keys.copy()
2388
2286
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2389
2287
 
2390
- maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2288
+ maybe_date_column = self._get_date_column(self.fit_search_keys)
2391
2289
  has_date = maybe_date_column is not None
2392
2290
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2393
-
2394
2291
  self._validate_binary_observations(validated_y, model_task_type)
2395
2292
 
2396
2293
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2420,13 +2317,7 @@ class FeaturesEnricher(TransformerMixin):
2420
2317
  self.fit_generated_features = []
2421
2318
 
2422
2319
  if has_date:
2423
- converter = DateTimeSearchKeyConverter(
2424
- maybe_date_column,
2425
- self.date_format,
2426
- self.logger,
2427
- bundle=self.bundle,
2428
- warnings_counter=self.warning_counter,
2429
- )
2320
+ converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2430
2321
  df = converter.convert(df, keep_time=True)
2431
2322
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2432
2323
  self.fit_generated_features.extend(converter.generated_features)
@@ -2435,12 +2326,6 @@ class FeaturesEnricher(TransformerMixin):
2435
2326
  if self.add_date_if_missing:
2436
2327
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2437
2328
 
2438
- email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2439
- if email_columns:
2440
- generator = EmailDomainGenerator(email_columns)
2441
- df = generator.generate(df)
2442
- self.fit_generated_features.extend(generator.generated_features)
2443
-
2444
2329
  # Checks that need validated date
2445
2330
  validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2446
2331
 
@@ -2449,12 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
2449
2334
 
2450
2335
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2451
2336
 
2452
- normalizer = Normalizer(
2453
- self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2454
- )
2455
- df = normalizer.normalize(df)
2456
- columns_renaming = normalizer.columns_renaming
2457
- self.fit_columns_renaming = columns_renaming
2337
+ # TODO normalize and convert all columns
2458
2338
 
2459
2339
  df = remove_fintech_duplicates(
2460
2340
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2462,58 +2342,38 @@ class FeaturesEnricher(TransformerMixin):
2462
2342
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2463
2343
 
2464
2344
  # Explode multiple search keys
2465
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2345
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
+ meaning_types = {
2347
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2348
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
+ }
2350
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
+ if eval_set is not None and len(eval_set) > 0:
2352
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2466
2354
 
2467
2355
  # TODO check that this is correct for enrichment
2468
2356
  self.df_with_original_index = df.copy()
2469
- # TODO check maybe need to drop _time column from df_with_original_index
2470
2357
 
2471
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2358
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2472
2359
 
2473
2360
  # Convert EMAIL to HEM after unnesting to do it only with one column
2474
2361
  email_column = self._get_email_column(self.fit_search_keys)
2475
2362
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
+ email_converted_to_hem = False
2476
2364
  if email_column:
2477
2365
  converter = EmailSearchKeyConverter(
2478
- email_column,
2479
- hem_column,
2480
- self.fit_search_keys,
2481
- columns_renaming,
2482
- list(unnest_search_keys.keys()),
2483
- self.logger,
2366
+ email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2484
2367
  )
2485
2368
  df = converter.convert(df)
2486
-
2487
- ip_column = self._get_ip_column(self.fit_search_keys)
2488
- if ip_column:
2489
- converter = IpSearchKeyConverter(
2490
- ip_column,
2491
- self.fit_search_keys,
2492
- columns_renaming,
2493
- list(unnest_search_keys.keys()),
2494
- self.bundle,
2495
- self.logger,
2496
- )
2497
- df = converter.convert(df)
2498
-
2499
- phone_column = self._get_phone_column(self.fit_search_keys)
2500
- country_column = self._get_country_column(self.fit_search_keys)
2501
- if phone_column:
2502
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2503
- df = converter.convert(df)
2504
-
2505
- if country_column:
2506
- converter = CountrySearchKeyConverter(country_column)
2507
- df = converter.convert(df)
2508
-
2509
- postal_code = self._get_postal_column(self.fit_search_keys)
2510
- if postal_code:
2511
- converter = PostalCodeSearchKeyConverter(postal_code)
2512
- df = converter.convert(df)
2369
+ self.fit_generated_features.extend(converter.generated_features)
2370
+ email_converted_to_hem = converter.email_converted_to_hem
2513
2371
 
2514
2372
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2515
2373
  self.fit_search_keys.keys()
2516
2374
  )
2375
+ if email_converted_to_hem:
2376
+ non_feature_columns.append(email_column)
2517
2377
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2518
2378
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2519
2379
 
@@ -2525,6 +2385,9 @@ class FeaturesEnricher(TransformerMixin):
2525
2385
  self.fit_dropped_features.update(features_to_drop)
2526
2386
  df = df.drop(columns=features_to_drop)
2527
2387
 
2388
+ if email_converted_to_hem:
2389
+ self.fit_dropped_features.add(email_column)
2390
+
2528
2391
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2529
2392
 
2530
2393
  meaning_types = {
@@ -2538,12 +2401,7 @@ class FeaturesEnricher(TransformerMixin):
2538
2401
  if eval_set is not None and len(eval_set) > 0:
2539
2402
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2540
2403
 
2541
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2542
-
2543
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2544
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2545
-
2546
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2404
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2547
2405
 
2548
2406
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2549
2407
 
@@ -2561,7 +2419,8 @@ class FeaturesEnricher(TransformerMixin):
2561
2419
  rest_client=self.rest_client,
2562
2420
  logger=self.logger,
2563
2421
  )
2564
- dataset.columns_renaming = columns_renaming
2422
+ if email_converted_to_hem:
2423
+ dataset.ignore_columns = [email_column]
2565
2424
 
2566
2425
  self.passed_features = [
2567
2426
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2950,7 +2809,7 @@ class FeaturesEnricher(TransformerMixin):
2950
2809
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2951
2810
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2952
2811
  else:
2953
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2812
+ date_column = FeaturesEnricher._get_date_column(search_keys)
2954
2813
  sort_columns = [date_column] if date_column is not None else []
2955
2814
 
2956
2815
  # Xy = pd.concat([X, y], axis=1)
@@ -3046,10 +2905,10 @@ class FeaturesEnricher(TransformerMixin):
3046
2905
 
3047
2906
  do_without_pandas_limits(print_datasets_sample)
3048
2907
 
3049
- maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2908
+ maybe_date_col = self._get_date_column(self.search_keys)
3050
2909
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3051
2910
  # TODO cast date column to single dtype
3052
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2911
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3053
2912
  converted_X = date_converter.convert(X)
3054
2913
  min_date = converted_X[maybe_date_col].min()
3055
2914
  max_date = converted_X[maybe_date_col].max()
@@ -3076,6 +2935,12 @@ class FeaturesEnricher(TransformerMixin):
3076
2935
 
3077
2936
  return df
3078
2937
 
2938
+ @staticmethod
2939
+ def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
+ for col, t in search_keys.items():
2941
+ if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
+ return col
2943
+
3079
2944
  @staticmethod
3080
2945
  def _add_current_date_as_key(
3081
2946
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -3091,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
3091
2956
  logger.warning(msg)
3092
2957
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3093
2958
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3094
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2959
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3095
2960
  df = converter.convert(df)
3096
2961
  return df
3097
2962
 
@@ -3119,37 +2984,17 @@ class FeaturesEnricher(TransformerMixin):
3119
2984
  if len(cols) == 1:
3120
2985
  return cols[0]
3121
2986
 
3122
- @staticmethod
3123
- def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3124
- cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3125
- if len(cols) > 1:
3126
- raise Exception("More than one ip column found after unnest")
3127
- if len(cols) == 1:
3128
- return cols[0]
3129
-
3130
2987
  @staticmethod
3131
2988
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3132
2989
  for col, t in search_keys.items():
3133
2990
  if t == SearchKey.PHONE:
3134
2991
  return col
3135
2992
 
3136
- @staticmethod
3137
- def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3138
- for col, t in search_keys.items():
3139
- if t == SearchKey.COUNTRY:
3140
- return col
3141
-
3142
- @staticmethod
3143
- def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3144
- for col, t in search_keys.items():
3145
- if t == SearchKey.POSTAL_CODE:
3146
- return col
3147
-
3148
2993
  def _explode_multiple_search_keys(
3149
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2994
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3150
2995
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
3151
2996
  # find groups of multiple search keys
3152
- search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2997
+ search_key_names_by_type: Dict[SearchKey, str] = {}
3153
2998
  for key_name, key_type in search_keys.items():
3154
2999
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3155
3000
  search_key_names_by_type = {
@@ -3173,7 +3018,6 @@ class FeaturesEnricher(TransformerMixin):
3173
3018
  del search_keys[old_key]
3174
3019
  search_keys[new_search_key] = key_type
3175
3020
  unnest_search_keys[new_search_key] = key_names
3176
- columns_renaming[new_search_key] = new_search_key
3177
3021
 
3178
3022
  df = pd.concat(exploded_dfs, ignore_index=True)
3179
3023
  return df, unnest_search_keys
@@ -3181,7 +3025,7 @@ class FeaturesEnricher(TransformerMixin):
3181
3025
  def __add_fit_system_record_id(
3182
3026
  self,
3183
3027
  df: pd.DataFrame,
3184
- # meaning_types: Dict[str, FileColumnMeaningType],
3028
+ meaning_types: Dict[str, FileColumnMeaningType],
3185
3029
  search_keys: Dict[str, SearchKey],
3186
3030
  id_name: str,
3187
3031
  ) -> pd.DataFrame:
@@ -3204,9 +3048,9 @@ class FeaturesEnricher(TransformerMixin):
3204
3048
  ]
3205
3049
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3206
3050
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3207
- sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3051
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3208
3052
  else:
3209
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3053
+ date_column = self._get_date_column(search_keys)
3210
3054
  sort_columns = [date_column] if date_column is not None else []
3211
3055
 
3212
3056
  other_columns = sorted(
@@ -3215,6 +3059,13 @@ class FeaturesEnricher(TransformerMixin):
3215
3059
  for c in df.columns
3216
3060
  if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3217
3061
  ]
3062
+ # [
3063
+ # sk
3064
+ # for sk, key_type in search_keys.items()
3065
+ # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
+ # and sk in df.columns
3067
+ # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
+ # ]
3218
3069
  )
3219
3070
 
3220
3071
  search_keys_hash = "search_keys_hash"
@@ -3227,6 +3078,9 @@ class FeaturesEnricher(TransformerMixin):
3227
3078
  if search_keys_hash in df.columns:
3228
3079
  df.drop(columns=search_keys_hash, inplace=True)
3229
3080
 
3081
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
+ df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
+
3230
3084
  df = df.reset_index(drop=True).reset_index()
3231
3085
  # system_record_id saves correct order for fit
3232
3086
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3236,11 +3090,11 @@ class FeaturesEnricher(TransformerMixin):
3236
3090
  df.index.name = original_index_name
3237
3091
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3238
3092
 
3239
- # meaning_types[id_name] = (
3240
- # FileColumnMeaningType.SYSTEM_RECORD_ID
3241
- # if id_name == SYSTEM_RECORD_ID
3242
- # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3243
- # )
3093
+ meaning_types[id_name] = (
3094
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3095
+ if id_name == SYSTEM_RECORD_ID
3096
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
+ )
3244
3098
  return df
3245
3099
 
3246
3100
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3618,13 +3472,10 @@ class FeaturesEnricher(TransformerMixin):
3618
3472
  for _, key_type in search_keys.items():
3619
3473
  if not isinstance(key_type, SearchKey):
3620
3474
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3621
-
3622
3475
  valid_search_keys = {}
3623
3476
  unsupported_search_keys = {
3624
3477
  SearchKey.IP_RANGE_FROM,
3625
3478
  SearchKey.IP_RANGE_TO,
3626
- SearchKey.IPV6_RANGE_FROM,
3627
- SearchKey.IPV6_RANGE_TO,
3628
3479
  SearchKey.MSISDN_RANGE_FROM,
3629
3480
  SearchKey.MSISDN_RANGE_TO,
3630
3481
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3714,7 +3565,6 @@ class FeaturesEnricher(TransformerMixin):
3714
3565
  print(msg)
3715
3566
  self.logger.warning(msg)
3716
3567
  self.warning_counter.increment()
3717
- # TODO maybe raise ValidationError
3718
3568
 
3719
3569
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3720
3570