upgini 1.1.312a4__py3-none-any.whl → 1.1.313__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -61,15 +61,11 @@ from upgini.metadata import (
61
61
  SearchKey,
62
62
  )
63
63
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
64
- from upgini.normalizer.normalize_utils import Normalizer
65
64
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
66
65
  from upgini.search_task import SearchTask
67
66
  from upgini.spinner import Spinner
68
67
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
69
- from upgini.utils.country_utils import (
70
- CountrySearchKeyConverter,
71
- CountrySearchKeyDetector,
72
- )
68
+ from upgini.utils.country_utils import CountrySearchKeyDetector
73
69
  from upgini.utils.custom_loss_utils import (
74
70
  get_additional_params_custom_loss,
75
71
  get_runtime_params_custom_loss,
@@ -91,15 +87,11 @@ from upgini.utils.display_utils import (
91
87
  prepare_and_show_report,
92
88
  show_request_quote_button,
93
89
  )
94
- from upgini.utils.email_utils import EmailDomainGenerator, EmailSearchKeyConverter, EmailSearchKeyDetector
90
+ from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
95
91
  from upgini.utils.features_validator import FeaturesValidator
96
92
  from upgini.utils.format import Format
97
- from upgini.utils.ip_utils import IpSearchKeyConverter
98
- from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
99
- from upgini.utils.postal_code_utils import (
100
- PostalCodeSearchKeyConverter,
101
- PostalCodeSearchKeyDetector,
102
- )
93
+ from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
+ from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
103
95
 
104
96
  try:
105
97
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -245,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
245
237
 
246
238
  self.passed_features: List[str] = []
247
239
  self.df_with_original_index: Optional[pd.DataFrame] = None
248
- self.fit_columns_renaming: Optional[Dict[str, str]] = None
249
240
  self.country_added = False
250
241
  self.fit_generated_features: List[str] = []
251
242
  self.fit_dropped_features: Set[str] = set()
@@ -256,7 +247,7 @@ class FeaturesEnricher(TransformerMixin):
256
247
  self.eval_set: Optional[List[Tuple]] = None
257
248
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
258
249
  self.imbalanced = False
259
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
250
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
260
251
 
261
252
  validate_version(self.logger)
262
253
  self.search_keys = search_keys or {}
@@ -715,7 +706,7 @@ class FeaturesEnricher(TransformerMixin):
715
706
 
716
707
  start_time = time.time()
717
708
  try:
718
- result, _ = self.__inner_transform(
709
+ result = self.__inner_transform(
719
710
  trace_id,
720
711
  X,
721
712
  exclude_features_sources=exclude_features_sources,
@@ -915,16 +906,8 @@ class FeaturesEnricher(TransformerMixin):
915
906
  search_keys,
916
907
  groups,
917
908
  _cv,
918
- columns_renaming,
919
909
  ) = prepared_data
920
910
 
921
- # rename cat_features
922
- if cat_features:
923
- for new_c, old_c in columns_renaming.items():
924
- if old_c in cat_features:
925
- cat_features.remove(old_c)
926
- cat_features.append(new_c)
927
-
928
911
  gc.collect()
929
912
 
930
913
  print(self.bundle.get("metrics_start"))
@@ -937,7 +920,7 @@ class FeaturesEnricher(TransformerMixin):
937
920
 
938
921
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
939
922
 
940
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
923
+ has_date = self._get_date_column(search_keys) is not None
941
924
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
942
925
 
943
926
  wrapper = EstimatorWrapper.create(
@@ -1130,7 +1113,7 @@ class FeaturesEnricher(TransformerMixin):
1130
1113
  )
1131
1114
 
1132
1115
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1133
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1116
+ date_column = self._get_date_column(search_keys)
1134
1117
  if (
1135
1118
  uplift_col in metrics_df.columns
1136
1119
  and (metrics_df[uplift_col] < 0).any()
@@ -1212,6 +1195,27 @@ class FeaturesEnricher(TransformerMixin):
1212
1195
  def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
1213
1196
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1214
1197
 
1198
+ def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
1199
+ search_keys = self.search_keys.copy()
1200
+ search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1201
+
1202
+ extended_X = x.copy()
1203
+ generated_features = []
1204
+ date_column = self._get_date_column(search_keys)
1205
+ if date_column is not None:
1206
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1207
+ extended_X = converter.convert(extended_X, keep_time=True)
1208
+ generated_features.extend(converter.generated_features)
1209
+ email_column = self._get_email_column(search_keys)
1210
+ hem_column = self._get_hem_column(search_keys)
1211
+ if email_column:
1212
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1213
+ extended_X = converter.convert(extended_X)
1214
+ generated_features.extend(converter.generated_features)
1215
+ generated_features = [f for f in generated_features if f in self.fit_generated_features]
1216
+
1217
+ return extended_X, search_keys
1218
+
1215
1219
  def _is_input_same_as_fit(
1216
1220
  self,
1217
1221
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
@@ -1255,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
1255
1259
  groups = None
1256
1260
 
1257
1261
  if not isinstance(_cv, BaseCrossValidator):
1258
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1262
+ date_column = self._get_date_column(search_keys)
1259
1263
  date_series = X[date_column] if date_column is not None else None
1260
1264
  _cv, groups = CVConfig(
1261
1265
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1278,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
1278
1282
 
1279
1283
  def _get_client_cat_features(
1280
1284
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1281
- ) -> Tuple[Optional[List[str]], List[str]]:
1285
+ ) -> Optional[List[str]]:
1282
1286
  cat_features = None
1283
1287
  search_keys_for_metrics = []
1284
1288
  if (
@@ -1338,15 +1342,11 @@ class FeaturesEnricher(TransformerMixin):
1338
1342
  progress_bar,
1339
1343
  progress_callback,
1340
1344
  )
1341
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1342
- sampled_data
1343
- )
1345
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1344
1346
 
1345
1347
  excluding_search_keys = list(search_keys.keys())
1346
1348
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1347
- for sk in excluding_search_keys:
1348
- if columns_renaming.get(sk) in search_keys_for_metrics:
1349
- excluding_search_keys.remove(sk)
1349
+ excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1350
1350
 
1351
1351
  client_features = [
1352
1352
  c
@@ -1392,7 +1392,6 @@ class FeaturesEnricher(TransformerMixin):
1392
1392
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1393
1393
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1394
1394
 
1395
- # TODO maybe there is no more need for these convertions
1396
1395
  # Remove datetime features
1397
1396
  datetime_features = [
1398
1397
  f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
@@ -1480,7 +1479,6 @@ class FeaturesEnricher(TransformerMixin):
1480
1479
  search_keys,
1481
1480
  groups,
1482
1481
  cv,
1483
- columns_renaming,
1484
1482
  )
1485
1483
 
1486
1484
  @dataclass
@@ -1490,7 +1488,6 @@ class FeaturesEnricher(TransformerMixin):
1490
1488
  enriched_X: pd.DataFrame
1491
1489
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1492
1490
  search_keys: Dict[str, SearchKey]
1493
- columns_renaming: Dict[str, str]
1494
1491
 
1495
1492
  def _sample_data_for_metrics(
1496
1493
  self,
@@ -1530,15 +1527,11 @@ class FeaturesEnricher(TransformerMixin):
1530
1527
  )
1531
1528
 
1532
1529
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1533
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1534
- self.__cached_sampled_datasets
1535
- )
1530
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1536
1531
  if exclude_features_sources:
1537
1532
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1538
1533
 
1539
- return self.__mk_sampled_data_tuple(
1540
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1541
- )
1534
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1542
1535
 
1543
1536
  def __sample_only_input(
1544
1537
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1556,28 +1549,6 @@ class FeaturesEnricher(TransformerMixin):
1556
1549
  eval_xy[EVAL_SET_INDEX] = idx + 1
1557
1550
  df = pd.concat([df, eval_xy])
1558
1551
 
1559
- search_keys = self.search_keys.copy()
1560
- search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1561
-
1562
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1563
- generated_features = []
1564
- if date_column is not None:
1565
- converter = DateTimeSearchKeyConverter(
1566
- date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1567
- )
1568
- df = converter.convert(df, keep_time=True)
1569
- generated_features = converter.generated_features
1570
-
1571
- email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
1572
- if email_columns:
1573
- generator = EmailDomainGenerator(email_columns)
1574
- df = generator.generate(df)
1575
- generated_features.extend(generator.generated_features)
1576
-
1577
- normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1578
- df = normalizer.normalize(df)
1579
- columns_renaming = normalizer.columns_renaming
1580
-
1581
1552
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1582
1553
 
1583
1554
  num_samples = _num_samples(df)
@@ -1590,34 +1561,24 @@ class FeaturesEnricher(TransformerMixin):
1590
1561
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1591
1562
  df = df.sample(n=sample_rows, random_state=self.random_state)
1592
1563
 
1593
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1594
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1595
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1564
+ df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
+ df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1596
1566
 
1597
- train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1567
+ train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1598
1568
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1599
1569
  y_sampled = train_df[TARGET].copy()
1600
1570
  enriched_X = X_sampled
1601
1571
 
1602
1572
  if eval_set is not None:
1603
1573
  for idx in range(len(eval_set)):
1604
- eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1574
+ eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1605
1575
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1606
1576
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1607
1577
  enriched_eval_X = eval_X_sampled
1608
1578
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1609
- self.__cached_sampled_datasets = (
1610
- X_sampled,
1611
- y_sampled,
1612
- enriched_X,
1613
- eval_set_sampled_dict,
1614
- search_keys,
1615
- columns_renaming,
1616
- )
1579
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1617
1580
 
1618
- return self.__mk_sampled_data_tuple(
1619
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1620
- )
1581
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1621
1582
 
1622
1583
  def __sample_balanced(
1623
1584
  self,
@@ -1629,7 +1590,7 @@ class FeaturesEnricher(TransformerMixin):
1629
1590
  search_keys = self.fit_search_keys
1630
1591
 
1631
1592
  rows_to_drop = None
1632
- has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1593
+ has_date = self._get_date_column(search_keys) is not None
1633
1594
  task_type = self.model_task_type or define_task(
1634
1595
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1635
1596
  )
@@ -1683,18 +1644,9 @@ class FeaturesEnricher(TransformerMixin):
1683
1644
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1684
1645
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1685
1646
 
1686
- self.__cached_sampled_datasets = (
1687
- X_sampled,
1688
- y_sampled,
1689
- enriched_X,
1690
- eval_set_sampled_dict,
1691
- search_keys,
1692
- self.fit_columns_renaming,
1693
- )
1647
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1694
1648
 
1695
- return self.__mk_sampled_data_tuple(
1696
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1697
- )
1649
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1698
1650
 
1699
1651
  def __sample_imbalanced(
1700
1652
  self,
@@ -1734,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
1734
1686
  tmp_target_name = "__target"
1735
1687
  df = df.rename(columns={TARGET: tmp_target_name})
1736
1688
 
1737
- enriched_df, columns_renaming = self.__inner_transform(
1689
+ enriched_df = self.__inner_transform(
1738
1690
  trace_id,
1739
1691
  df,
1740
1692
  exclude_features_sources=exclude_features_sources,
@@ -1782,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
1782
1734
  tmp_target_name = "__target"
1783
1735
  df = df.rename(columns={TARGET: tmp_target_name})
1784
1736
 
1785
- enriched_Xy, columns_renaming = self.__inner_transform(
1737
+ enriched_Xy = self.__inner_transform(
1786
1738
  trace_id,
1787
1739
  df,
1788
1740
  exclude_features_sources=exclude_features_sources,
@@ -1807,18 +1759,9 @@ class FeaturesEnricher(TransformerMixin):
1807
1759
  y_sampled = enriched_Xy[TARGET].copy()
1808
1760
  enriched_X = enriched_Xy.drop(columns=TARGET)
1809
1761
 
1810
- self.__cached_sampled_datasets = (
1811
- X_sampled,
1812
- y_sampled,
1813
- enriched_X,
1814
- eval_set_sampled_dict,
1815
- self.search_keys,
1816
- columns_renaming,
1817
- )
1762
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1818
1763
 
1819
- return self.__mk_sampled_data_tuple(
1820
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1821
- )
1764
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1822
1765
 
1823
1766
  def __mk_sampled_data_tuple(
1824
1767
  self,
@@ -1827,7 +1770,6 @@ class FeaturesEnricher(TransformerMixin):
1827
1770
  enriched_X: pd.DataFrame,
1828
1771
  eval_set_sampled_dict: Dict,
1829
1772
  search_keys: Dict,
1830
- columns_renaming: Dict[str, str],
1831
1773
  ):
1832
1774
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1833
1775
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1836,7 +1778,6 @@ class FeaturesEnricher(TransformerMixin):
1836
1778
  enriched_X=enriched_X,
1837
1779
  eval_set_sampled_dict=eval_set_sampled_dict,
1838
1780
  search_keys=search_keys,
1839
- columns_renaming=columns_renaming,
1840
1781
  )
1841
1782
 
1842
1783
  def get_search_id(self) -> Optional[str]:
@@ -1925,7 +1866,7 @@ class FeaturesEnricher(TransformerMixin):
1925
1866
  progress_bar: Optional[ProgressBar] = None,
1926
1867
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1927
1868
  add_fit_system_record_id: bool = False,
1928
- ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1869
+ ) -> pd.DataFrame:
1929
1870
  if self._search_task is None:
1930
1871
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1931
1872
 
@@ -1938,13 +1879,13 @@ class FeaturesEnricher(TransformerMixin):
1938
1879
 
1939
1880
  if len(self.feature_names_) == 0:
1940
1881
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1941
- return X, {c: c for c in X.columns}
1882
+ return X
1942
1883
 
1943
1884
  if self._has_paid_features(exclude_features_sources):
1944
1885
  msg = self.bundle.get("transform_with_paid_features")
1945
1886
  self.logger.warning(msg)
1946
1887
  self.__display_support_link(msg)
1947
- return None, {c: c for c in X.columns}
1888
+ return None
1948
1889
 
1949
1890
  if not metrics_calculation:
1950
1891
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1955,7 +1896,7 @@ class FeaturesEnricher(TransformerMixin):
1955
1896
  self.logger.warning(msg)
1956
1897
  print(msg)
1957
1898
  show_request_quote_button()
1958
- return None, {c: c for c in X.columns}
1899
+ return None
1959
1900
  else:
1960
1901
  msg = self.bundle.get("transform_usage_info").format(
1961
1902
  transform_usage.limit, transform_usage.transformed_rows
@@ -1993,11 +1934,9 @@ class FeaturesEnricher(TransformerMixin):
1993
1934
  df = self.__add_country_code(df, search_keys)
1994
1935
 
1995
1936
  generated_features = []
1996
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1937
+ date_column = self._get_date_column(search_keys)
1997
1938
  if date_column is not None:
1998
- converter = DateTimeSearchKeyConverter(
1999
- date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2000
- )
1939
+ converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2001
1940
  df = converter.convert(df)
2002
1941
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2003
1942
  generated_features.extend(converter.generated_features)
@@ -2006,93 +1945,61 @@ class FeaturesEnricher(TransformerMixin):
2006
1945
  if self.add_date_if_missing:
2007
1946
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2008
1947
 
2009
- email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2010
- if email_columns:
2011
- generator = EmailDomainGenerator(email_columns)
2012
- df = generator.generate(df)
2013
- generated_features.extend(generator.generated_features)
2014
-
2015
- normalizer = Normalizer(
2016
- search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2017
- )
2018
- df = normalizer.normalize(df)
2019
- columns_renaming = normalizer.columns_renaming
2020
-
2021
1948
  # Don't pass all features in backend on transform
1949
+ original_features_for_transform = []
2022
1950
  runtime_parameters = self._get_copy_of_runtime_parameters()
2023
- features_for_transform = self._search_task.get_features_for_transform() or []
2024
- if len(features_for_transform) > 0:
2025
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1951
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
+ if len(features_not_to_pass) > 0:
1953
+ # Pass only features that need for transform
1954
+ features_for_transform = self._search_task.get_features_for_transform()
1955
+ if features_for_transform is not None and len(features_for_transform) > 0:
1956
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1957
+ original_features_for_transform = [
1958
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
+ ]
1960
+
1961
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2026
1962
 
2027
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1963
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2028
1964
 
2029
1965
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
2030
1966
  df[columns_for_system_record_id], index=False
2031
1967
  ).astype("Float64")
2032
1968
 
2033
1969
  # Explode multiple search keys
2034
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1970
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2035
1971
 
2036
1972
  email_column = self._get_email_column(search_keys)
2037
1973
  hem_column = self._get_hem_column(search_keys)
1974
+ email_converted_to_hem = False
2038
1975
  if email_column:
2039
1976
  converter = EmailSearchKeyConverter(
2040
- email_column,
2041
- hem_column,
2042
- search_keys,
2043
- columns_renaming,
2044
- list(unnest_search_keys.keys()),
2045
- self.logger,
1977
+ email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2046
1978
  )
2047
1979
  df = converter.convert(df)
2048
-
2049
- ip_column = self._get_ip_column(search_keys)
2050
- if ip_column:
2051
- converter = IpSearchKeyConverter(
2052
- ip_column,
2053
- search_keys,
2054
- columns_renaming,
2055
- list(unnest_search_keys.keys()),
2056
- self.bundle,
2057
- self.logger,
2058
- )
2059
- df = converter.convert(df)
2060
-
2061
- phone_column = self._get_phone_column(search_keys)
2062
- country_column = self._get_country_column(search_keys)
2063
- if phone_column:
2064
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2065
- df = converter.convert(df)
2066
-
2067
- if country_column:
2068
- converter = CountrySearchKeyConverter(country_column)
2069
- df = converter.convert(df)
2070
-
2071
- postal_code = self._get_postal_column(search_keys)
2072
- if postal_code:
2073
- converter = PostalCodeSearchKeyConverter(postal_code)
2074
- df = converter.convert(df)
2075
-
1980
+ generated_features.extend(converter.generated_features)
1981
+ email_converted_to_hem = converter.email_converted_to_hem
2076
1982
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
2077
1983
 
2078
1984
  meaning_types = {col: key.value for col, key in search_keys.items()}
2079
- for col in features_for_transform:
1985
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
+ for col in original_features_for_transform:
2080
1987
  meaning_types[col] = FileColumnMeaningType.FEATURE
2081
- features_not_to_pass = [
2082
- c
2083
- for c in df.columns
2084
- if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
2085
- ]
1988
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
+
1990
+ if email_converted_to_hem:
1991
+ features_not_to_pass.append(email_column)
1992
+
1993
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2086
1995
 
2087
1996
  if add_fit_system_record_id:
2088
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2089
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2090
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1997
+ df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2091
1998
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2092
1999
  features_not_to_pass.append(SORT_ID)
2093
2000
 
2094
- # search keys might be changed after explode
2095
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2001
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
+
2096
2003
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2097
2004
  "Float64"
2098
2005
  )
@@ -2128,7 +2035,8 @@ class FeaturesEnricher(TransformerMixin):
2128
2035
  rest_client=self.rest_client,
2129
2036
  logger=self.logger,
2130
2037
  )
2131
- dataset.columns_renaming = columns_renaming
2038
+ if email_converted_to_hem:
2039
+ dataset.ignore_columns = [email_column]
2132
2040
 
2133
2041
  if max_features is not None or importance_threshold is not None:
2134
2042
  exclude_features_sources = list(
@@ -2230,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
2230
2138
  if add_fit_system_record_id:
2231
2139
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2232
2140
 
2233
- return result, columns_renaming
2141
+ return result
2234
2142
 
2235
2143
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2236
2144
  features_info = self._internal_features_info
@@ -2331,9 +2239,6 @@ class FeaturesEnricher(TransformerMixin):
2331
2239
  self.df_with_original_index = None
2332
2240
  self.__cached_sampled_datasets = None
2333
2241
  self.metrics = None
2334
- self.fit_columns_renaming = None
2335
- self.fit_dropped_features = set()
2336
- self.fit_generated_features = []
2337
2242
 
2338
2243
  validated_X = self._validate_X(X)
2339
2244
  validated_y = self._validate_y(validated_X, y)
@@ -2380,10 +2285,9 @@ class FeaturesEnricher(TransformerMixin):
2380
2285
  self.fit_search_keys = self.search_keys.copy()
2381
2286
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2382
2287
 
2383
- maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2288
+ maybe_date_column = self._get_date_column(self.fit_search_keys)
2384
2289
  has_date = maybe_date_column is not None
2385
2290
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2386
-
2387
2291
  self._validate_binary_observations(validated_y, model_task_type)
2388
2292
 
2389
2293
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2413,13 +2317,7 @@ class FeaturesEnricher(TransformerMixin):
2413
2317
  self.fit_generated_features = []
2414
2318
 
2415
2319
  if has_date:
2416
- converter = DateTimeSearchKeyConverter(
2417
- maybe_date_column,
2418
- self.date_format,
2419
- self.logger,
2420
- bundle=self.bundle,
2421
- warnings_counter=self.warning_counter,
2422
- )
2320
+ converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2423
2321
  df = converter.convert(df, keep_time=True)
2424
2322
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2425
2323
  self.fit_generated_features.extend(converter.generated_features)
@@ -2428,14 +2326,6 @@ class FeaturesEnricher(TransformerMixin):
2428
2326
  if self.add_date_if_missing:
2429
2327
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2430
2328
 
2431
- email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2432
- if email_columns:
2433
- generator = EmailDomainGenerator(
2434
- email_columns
2435
- )
2436
- df = generator.generate(df)
2437
- self.fit_generated_features.extend(generator.generated_features)
2438
-
2439
2329
  # Checks that need validated date
2440
2330
  validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2441
2331
 
@@ -2444,12 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
2444
2334
 
2445
2335
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2446
2336
 
2447
- normalizer = Normalizer(
2448
- self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2449
- )
2450
- df = normalizer.normalize(df)
2451
- columns_renaming = normalizer.columns_renaming
2452
- self.fit_columns_renaming = columns_renaming
2337
+ # TODO normalize and convert all columns
2453
2338
 
2454
2339
  df = remove_fintech_duplicates(
2455
2340
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2457,58 +2342,38 @@ class FeaturesEnricher(TransformerMixin):
2457
2342
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2458
2343
 
2459
2344
  # Explode multiple search keys
2460
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2345
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
+ meaning_types = {
2347
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2348
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
+ }
2350
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
+ if eval_set is not None and len(eval_set) > 0:
2352
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2461
2354
 
2462
2355
  # TODO check that this is correct for enrichment
2463
2356
  self.df_with_original_index = df.copy()
2464
- # TODO check maybe need to drop _time column from df_with_original_index
2465
2357
 
2466
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2358
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2467
2359
 
2468
2360
  # Convert EMAIL to HEM after unnesting to do it only with one column
2469
2361
  email_column = self._get_email_column(self.fit_search_keys)
2470
2362
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
+ email_converted_to_hem = False
2471
2364
  if email_column:
2472
2365
  converter = EmailSearchKeyConverter(
2473
- email_column,
2474
- hem_column,
2475
- self.fit_search_keys,
2476
- columns_renaming,
2477
- list(unnest_search_keys.keys()),
2478
- self.logger,
2479
- )
2480
- df = converter.convert(df)
2481
-
2482
- ip_column = self._get_ip_column(self.fit_search_keys)
2483
- if ip_column:
2484
- converter = IpSearchKeyConverter(
2485
- ip_column,
2486
- self.fit_search_keys,
2487
- columns_renaming,
2488
- list(unnest_search_keys.keys()),
2489
- self.bundle,
2490
- self.logger,
2366
+ email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2491
2367
  )
2492
2368
  df = converter.convert(df)
2493
-
2494
- phone_column = self._get_phone_column(self.fit_search_keys)
2495
- country_column = self._get_country_column(self.fit_search_keys)
2496
- if phone_column:
2497
- converter = PhoneSearchKeyConverter(phone_column, country_column)
2498
- df = converter.convert(df)
2499
-
2500
- if country_column:
2501
- converter = CountrySearchKeyConverter(country_column)
2502
- df = converter.convert(df)
2503
-
2504
- postal_code = self._get_postal_column(self.fit_search_keys)
2505
- if postal_code:
2506
- converter = PostalCodeSearchKeyConverter(postal_code)
2507
- df = converter.convert(df)
2369
+ self.fit_generated_features.extend(converter.generated_features)
2370
+ email_converted_to_hem = converter.email_converted_to_hem
2508
2371
 
2509
2372
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2510
2373
  self.fit_search_keys.keys()
2511
2374
  )
2375
+ if email_converted_to_hem:
2376
+ non_feature_columns.append(email_column)
2512
2377
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2513
2378
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2514
2379
 
@@ -2520,6 +2385,9 @@ class FeaturesEnricher(TransformerMixin):
2520
2385
  self.fit_dropped_features.update(features_to_drop)
2521
2386
  df = df.drop(columns=features_to_drop)
2522
2387
 
2388
+ if email_converted_to_hem:
2389
+ self.fit_dropped_features.add(email_column)
2390
+
2523
2391
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2524
2392
 
2525
2393
  meaning_types = {
@@ -2533,12 +2401,7 @@ class FeaturesEnricher(TransformerMixin):
2533
2401
  if eval_set is not None and len(eval_set) > 0:
2534
2402
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2535
2403
 
2536
- df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2537
-
2538
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2539
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2540
-
2541
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2404
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2542
2405
 
2543
2406
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2544
2407
 
@@ -2556,7 +2419,8 @@ class FeaturesEnricher(TransformerMixin):
2556
2419
  rest_client=self.rest_client,
2557
2420
  logger=self.logger,
2558
2421
  )
2559
- dataset.columns_renaming = columns_renaming
2422
+ if email_converted_to_hem:
2423
+ dataset.ignore_columns = [email_column]
2560
2424
 
2561
2425
  self.passed_features = [
2562
2426
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2945,7 +2809,7 @@ class FeaturesEnricher(TransformerMixin):
2945
2809
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2946
2810
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2947
2811
  else:
2948
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2812
+ date_column = FeaturesEnricher._get_date_column(search_keys)
2949
2813
  sort_columns = [date_column] if date_column is not None else []
2950
2814
 
2951
2815
  # Xy = pd.concat([X, y], axis=1)
@@ -3041,10 +2905,10 @@ class FeaturesEnricher(TransformerMixin):
3041
2905
 
3042
2906
  do_without_pandas_limits(print_datasets_sample)
3043
2907
 
3044
- maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2908
+ maybe_date_col = self._get_date_column(self.search_keys)
3045
2909
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3046
2910
  # TODO cast date column to single dtype
3047
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2911
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3048
2912
  converted_X = date_converter.convert(X)
3049
2913
  min_date = converted_X[maybe_date_col].min()
3050
2914
  max_date = converted_X[maybe_date_col].max()
@@ -3071,6 +2935,12 @@ class FeaturesEnricher(TransformerMixin):
3071
2935
 
3072
2936
  return df
3073
2937
 
2938
+ @staticmethod
2939
+ def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
+ for col, t in search_keys.items():
2941
+ if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
+ return col
2943
+
3074
2944
  @staticmethod
3075
2945
  def _add_current_date_as_key(
3076
2946
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -3086,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
3086
2956
  logger.warning(msg)
3087
2957
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3088
2958
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3089
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2959
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3090
2960
  df = converter.convert(df)
3091
2961
  return df
3092
2962
 
@@ -3114,37 +2984,17 @@ class FeaturesEnricher(TransformerMixin):
3114
2984
  if len(cols) == 1:
3115
2985
  return cols[0]
3116
2986
 
3117
- @staticmethod
3118
- def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3119
- cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3120
- if len(cols) > 1:
3121
- raise Exception("More than one ip column found after unnest")
3122
- if len(cols) == 1:
3123
- return cols[0]
3124
-
3125
2987
  @staticmethod
3126
2988
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3127
2989
  for col, t in search_keys.items():
3128
2990
  if t == SearchKey.PHONE:
3129
2991
  return col
3130
2992
 
3131
- @staticmethod
3132
- def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3133
- for col, t in search_keys.items():
3134
- if t == SearchKey.COUNTRY:
3135
- return col
3136
-
3137
- @staticmethod
3138
- def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3139
- for col, t in search_keys.items():
3140
- if t == SearchKey.POSTAL_CODE:
3141
- return col
3142
-
3143
2993
  def _explode_multiple_search_keys(
3144
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2994
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3145
2995
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
3146
2996
  # find groups of multiple search keys
3147
- search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2997
+ search_key_names_by_type: Dict[SearchKey, str] = {}
3148
2998
  for key_name, key_type in search_keys.items():
3149
2999
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3150
3000
  search_key_names_by_type = {
@@ -3168,7 +3018,6 @@ class FeaturesEnricher(TransformerMixin):
3168
3018
  del search_keys[old_key]
3169
3019
  search_keys[new_search_key] = key_type
3170
3020
  unnest_search_keys[new_search_key] = key_names
3171
- columns_renaming[new_search_key] = new_search_key
3172
3021
 
3173
3022
  df = pd.concat(exploded_dfs, ignore_index=True)
3174
3023
  return df, unnest_search_keys
@@ -3176,7 +3025,7 @@ class FeaturesEnricher(TransformerMixin):
3176
3025
  def __add_fit_system_record_id(
3177
3026
  self,
3178
3027
  df: pd.DataFrame,
3179
- # meaning_types: Dict[str, FileColumnMeaningType],
3028
+ meaning_types: Dict[str, FileColumnMeaningType],
3180
3029
  search_keys: Dict[str, SearchKey],
3181
3030
  id_name: str,
3182
3031
  ) -> pd.DataFrame:
@@ -3199,9 +3048,9 @@ class FeaturesEnricher(TransformerMixin):
3199
3048
  ]
3200
3049
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3201
3050
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3202
- sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3051
+ sort_exclude_columns.append(self._get_date_column(search_keys))
3203
3052
  else:
3204
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3053
+ date_column = self._get_date_column(search_keys)
3205
3054
  sort_columns = [date_column] if date_column is not None else []
3206
3055
 
3207
3056
  other_columns = sorted(
@@ -3210,6 +3059,13 @@ class FeaturesEnricher(TransformerMixin):
3210
3059
  for c in df.columns
3211
3060
  if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3212
3061
  ]
3062
+ # [
3063
+ # sk
3064
+ # for sk, key_type in search_keys.items()
3065
+ # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
+ # and sk in df.columns
3067
+ # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
+ # ]
3213
3069
  )
3214
3070
 
3215
3071
  search_keys_hash = "search_keys_hash"
@@ -3222,6 +3078,9 @@ class FeaturesEnricher(TransformerMixin):
3222
3078
  if search_keys_hash in df.columns:
3223
3079
  df.drop(columns=search_keys_hash, inplace=True)
3224
3080
 
3081
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
+ df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
+
3225
3084
  df = df.reset_index(drop=True).reset_index()
3226
3085
  # system_record_id saves correct order for fit
3227
3086
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3231,11 +3090,11 @@ class FeaturesEnricher(TransformerMixin):
3231
3090
  df.index.name = original_index_name
3232
3091
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3233
3092
 
3234
- # meaning_types[id_name] = (
3235
- # FileColumnMeaningType.SYSTEM_RECORD_ID
3236
- # if id_name == SYSTEM_RECORD_ID
3237
- # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3238
- # )
3093
+ meaning_types[id_name] = (
3094
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3095
+ if id_name == SYSTEM_RECORD_ID
3096
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
+ )
3239
3098
  return df
3240
3099
 
3241
3100
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3613,13 +3472,10 @@ class FeaturesEnricher(TransformerMixin):
3613
3472
  for _, key_type in search_keys.items():
3614
3473
  if not isinstance(key_type, SearchKey):
3615
3474
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3616
-
3617
3475
  valid_search_keys = {}
3618
3476
  unsupported_search_keys = {
3619
3477
  SearchKey.IP_RANGE_FROM,
3620
3478
  SearchKey.IP_RANGE_TO,
3621
- SearchKey.IPV6_RANGE_FROM,
3622
- SearchKey.IPV6_RANGE_TO,
3623
3479
  SearchKey.MSISDN_RANGE_FROM,
3624
3480
  SearchKey.MSISDN_RANGE_TO,
3625
3481
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3709,7 +3565,6 @@ class FeaturesEnricher(TransformerMixin):
3709
3565
  print(msg)
3710
3566
  self.logger.warning(msg)
3711
3567
  self.warning_counter.increment()
3712
- # TODO maybe raise ValidationError
3713
3568
 
3714
3569
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3715
3570