upgini 1.1.315a3579.dev1__py3-none-any.whl → 1.1.316__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -61,11 +61,15 @@ from upgini.metadata import (
61
61
  SearchKey,
62
62
  )
63
63
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
64
+ from upgini.normalizer.normalize_utils import Normalizer
64
65
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
65
66
  from upgini.search_task import SearchTask
66
67
  from upgini.spinner import Spinner
67
68
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
68
- from upgini.utils.country_utils import CountrySearchKeyDetector
69
+ from upgini.utils.country_utils import (
70
+ CountrySearchKeyConverter,
71
+ CountrySearchKeyDetector,
72
+ )
69
73
  from upgini.utils.custom_loss_utils import (
70
74
  get_additional_params_custom_loss,
71
75
  get_runtime_params_custom_loss,
@@ -87,11 +91,19 @@ from upgini.utils.display_utils import (
87
91
  prepare_and_show_report,
88
92
  show_request_quote_button,
89
93
  )
90
- from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
94
+ from upgini.utils.email_utils import (
95
+ EmailDomainGenerator,
96
+ EmailSearchKeyConverter,
97
+ EmailSearchKeyDetector,
98
+ )
91
99
  from upgini.utils.features_validator import FeaturesValidator
92
100
  from upgini.utils.format import Format
93
- from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
- from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
101
+ from upgini.utils.ip_utils import IpSearchKeyConverter
102
+ from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
103
+ from upgini.utils.postal_code_utils import (
104
+ PostalCodeSearchKeyConverter,
105
+ PostalCodeSearchKeyDetector,
106
+ )
95
107
 
96
108
  try:
97
109
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -237,6 +249,7 @@ class FeaturesEnricher(TransformerMixin):
237
249
 
238
250
  self.passed_features: List[str] = []
239
251
  self.df_with_original_index: Optional[pd.DataFrame] = None
252
+ self.fit_columns_renaming: Optional[Dict[str, str]] = None
240
253
  self.country_added = False
241
254
  self.fit_generated_features: List[str] = []
242
255
  self.fit_dropped_features: Set[str] = set()
@@ -247,7 +260,7 @@ class FeaturesEnricher(TransformerMixin):
247
260
  self.eval_set: Optional[List[Tuple]] = None
248
261
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
249
262
  self.imbalanced = False
250
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
263
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
251
264
 
252
265
  validate_version(self.logger)
253
266
  self.search_keys = search_keys or {}
@@ -706,7 +719,7 @@ class FeaturesEnricher(TransformerMixin):
706
719
 
707
720
  start_time = time.time()
708
721
  try:
709
- result = self.__inner_transform(
722
+ result, _ = self.__inner_transform(
710
723
  trace_id,
711
724
  X,
712
725
  exclude_features_sources=exclude_features_sources,
@@ -833,17 +846,37 @@ class FeaturesEnricher(TransformerMixin):
833
846
  self.logger.warning(msg)
834
847
  print(msg)
835
848
 
849
+ if X is not None and y is None:
850
+ raise ValidationError("X passed without y")
851
+
836
852
  self.__validate_search_keys(self.search_keys, self.search_id)
837
853
  effective_X = X if X is not None else self.X
838
854
  effective_y = y if y is not None else self.y
839
855
  effective_eval_set = eval_set if eval_set is not None else self.eval_set
840
856
  effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
841
857
 
858
+ if (
859
+ self._search_task is None
860
+ or self._search_task.provider_metadata_v2 is None
861
+ or len(self._search_task.provider_metadata_v2) == 0
862
+ or effective_X is None
863
+ or effective_y is None
864
+ ):
865
+ raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
866
+
867
+ validated_X = self._validate_X(effective_X)
868
+ validated_y = self._validate_y(validated_X, effective_y)
869
+ validated_eval_set = (
870
+ [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
871
+ if effective_eval_set is not None
872
+ else None
873
+ )
874
+
842
875
  try:
843
876
  self.__log_debug_information(
844
- effective_X,
845
- effective_y,
846
- effective_eval_set,
877
+ validated_X,
878
+ validated_y,
879
+ validated_eval_set,
847
880
  exclude_features_sources=exclude_features_sources,
848
881
  cv=cv if cv is not None else self.cv,
849
882
  importance_threshold=importance_threshold,
@@ -853,21 +886,9 @@ class FeaturesEnricher(TransformerMixin):
853
886
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
854
887
  )
855
888
 
856
- if (
857
- self._search_task is None
858
- or self._search_task.provider_metadata_v2 is None
859
- or len(self._search_task.provider_metadata_v2) == 0
860
- or effective_X is None
861
- or effective_y is None
862
- ):
863
- raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
864
-
865
- if X is not None and y is None:
866
- raise ValidationError("X passed without y")
867
-
868
889
  validate_scoring_argument(scoring)
869
890
 
870
- self._validate_baseline_score(effective_X, effective_eval_set)
891
+ self._validate_baseline_score(validated_X, validated_eval_set)
871
892
 
872
893
  if self._has_paid_features(exclude_features_sources):
873
894
  msg = self.bundle.get("metrics_with_paid_features")
@@ -876,7 +897,7 @@ class FeaturesEnricher(TransformerMixin):
876
897
  return None
877
898
 
878
899
  cat_features, search_keys_for_metrics = self._get_client_cat_features(
879
- estimator, effective_X, self.search_keys
900
+ estimator, validated_X, self.search_keys
880
901
  )
881
902
 
882
903
  prepared_data = self._prepare_data_for_metrics(
@@ -906,8 +927,16 @@ class FeaturesEnricher(TransformerMixin):
906
927
  search_keys,
907
928
  groups,
908
929
  _cv,
930
+ columns_renaming,
909
931
  ) = prepared_data
910
932
 
933
+ # rename cat_features
934
+ if cat_features:
935
+ for new_c, old_c in columns_renaming.items():
936
+ if old_c in cat_features:
937
+ cat_features.remove(old_c)
938
+ cat_features.append(new_c)
939
+
911
940
  gc.collect()
912
941
 
913
942
  print(self.bundle.get("metrics_start"))
@@ -920,7 +949,7 @@ class FeaturesEnricher(TransformerMixin):
920
949
 
921
950
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
922
951
 
923
- has_date = self._get_date_column(search_keys) is not None
952
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
924
953
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
925
954
 
926
955
  wrapper = EstimatorWrapper.create(
@@ -1013,10 +1042,10 @@ class FeaturesEnricher(TransformerMixin):
1013
1042
  self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
1014
1043
  }
1015
1044
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1016
- y_sorted
1045
+ validated_y
1017
1046
  ):
1018
1047
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1019
- np.mean(effective_y), 4
1048
+ np.mean(validated_y), 4
1020
1049
  )
1021
1050
  if etalon_metric is not None:
1022
1051
  train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
@@ -1086,10 +1115,10 @@ class FeaturesEnricher(TransformerMixin):
1086
1115
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1087
1116
  }
1088
1117
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1089
- eval_y_sorted
1118
+ validated_eval_set[idx][1]
1090
1119
  ):
1091
1120
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1092
- np.mean(effective_eval_set[idx][1]), 4
1121
+ np.mean(validated_eval_set[idx][1]), 4
1093
1122
  )
1094
1123
  if etalon_eval_metric is not None:
1095
1124
  eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
@@ -1113,7 +1142,7 @@ class FeaturesEnricher(TransformerMixin):
1113
1142
  )
1114
1143
 
1115
1144
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1116
- date_column = self._get_date_column(search_keys)
1145
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1117
1146
  if (
1118
1147
  uplift_col in metrics_df.columns
1119
1148
  and (metrics_df[uplift_col] < 0).any()
@@ -1195,27 +1224,6 @@ class FeaturesEnricher(TransformerMixin):
1195
1224
  def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
1196
1225
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1197
1226
 
1198
- def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
1199
- search_keys = self.search_keys.copy()
1200
- search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1201
-
1202
- extended_X = x.copy()
1203
- generated_features = []
1204
- date_column = self._get_date_column(search_keys)
1205
- if date_column is not None:
1206
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1207
- extended_X = converter.convert(extended_X, keep_time=True)
1208
- generated_features.extend(converter.generated_features)
1209
- email_column = self._get_email_column(search_keys)
1210
- hem_column = self._get_hem_column(search_keys)
1211
- if email_column:
1212
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1213
- extended_X = converter.convert(extended_X)
1214
- generated_features.extend(converter.generated_features)
1215
- generated_features = [f for f in generated_features if f in self.fit_generated_features]
1216
-
1217
- return extended_X, search_keys
1218
-
1219
1227
  def _is_input_same_as_fit(
1220
1228
  self,
1221
1229
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
@@ -1259,7 +1267,7 @@ class FeaturesEnricher(TransformerMixin):
1259
1267
  groups = None
1260
1268
 
1261
1269
  if not isinstance(_cv, BaseCrossValidator):
1262
- date_column = self._get_date_column(search_keys)
1270
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1263
1271
  date_series = X[date_column] if date_column is not None else None
1264
1272
  _cv, groups = CVConfig(
1265
1273
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1282,7 +1290,7 @@ class FeaturesEnricher(TransformerMixin):
1282
1290
 
1283
1291
  def _get_client_cat_features(
1284
1292
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1285
- ) -> Optional[List[str]]:
1293
+ ) -> Tuple[Optional[List[str]], List[str]]:
1286
1294
  cat_features = None
1287
1295
  search_keys_for_metrics = []
1288
1296
  if (
@@ -1342,11 +1350,15 @@ class FeaturesEnricher(TransformerMixin):
1342
1350
  progress_bar,
1343
1351
  progress_callback,
1344
1352
  )
1345
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1353
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1354
+ sampled_data
1355
+ )
1346
1356
 
1347
1357
  excluding_search_keys = list(search_keys.keys())
1348
1358
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1349
- excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1359
+ for sk in excluding_search_keys:
1360
+ if columns_renaming.get(sk) in search_keys_for_metrics:
1361
+ excluding_search_keys.remove(sk)
1350
1362
 
1351
1363
  client_features = [
1352
1364
  c
@@ -1363,6 +1375,7 @@ class FeaturesEnricher(TransformerMixin):
1363
1375
  importance_threshold,
1364
1376
  max_features,
1365
1377
  )
1378
+ filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1366
1379
 
1367
1380
  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
1368
1381
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
@@ -1392,6 +1405,7 @@ class FeaturesEnricher(TransformerMixin):
1392
1405
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1393
1406
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1394
1407
 
1408
+ # TODO maybe there is no more need for these conversions
1395
1409
  # Remove datetime features
1396
1410
  datetime_features = [
1397
1411
  f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
@@ -1479,6 +1493,7 @@ class FeaturesEnricher(TransformerMixin):
1479
1493
  search_keys,
1480
1494
  groups,
1481
1495
  cv,
1496
+ columns_renaming,
1482
1497
  )
1483
1498
 
1484
1499
  @dataclass
@@ -1488,6 +1503,7 @@ class FeaturesEnricher(TransformerMixin):
1488
1503
  enriched_X: pd.DataFrame
1489
1504
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1490
1505
  search_keys: Dict[str, SearchKey]
1506
+ columns_renaming: Dict[str, str]
1491
1507
 
1492
1508
  def _sample_data_for_metrics(
1493
1509
  self,
@@ -1527,11 +1543,15 @@ class FeaturesEnricher(TransformerMixin):
1527
1543
  )
1528
1544
 
1529
1545
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1530
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1546
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1547
+ self.__cached_sampled_datasets
1548
+ )
1531
1549
  if exclude_features_sources:
1532
1550
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1533
1551
 
1534
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1552
+ return self.__mk_sampled_data_tuple(
1553
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1554
+ )
1535
1555
 
1536
1556
  def __sample_only_input(
1537
1557
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1549,6 +1569,28 @@ class FeaturesEnricher(TransformerMixin):
1549
1569
  eval_xy[EVAL_SET_INDEX] = idx + 1
1550
1570
  df = pd.concat([df, eval_xy])
1551
1571
 
1572
+ search_keys = self.search_keys.copy()
1573
+ search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1574
+
1575
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1576
+ generated_features = []
1577
+ if date_column is not None:
1578
+ converter = DateTimeSearchKeyConverter(
1579
+ date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1580
+ )
1581
+ df = converter.convert(df, keep_time=True)
1582
+ generated_features = converter.generated_features
1583
+
1584
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
1585
+ if email_columns:
1586
+ generator = EmailDomainGenerator(email_columns)
1587
+ df = generator.generate(df)
1588
+ generated_features.extend(generator.generated_features)
1589
+
1590
+ normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1591
+ df = normalizer.normalize(df)
1592
+ columns_renaming = normalizer.columns_renaming
1593
+
1552
1594
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1553
1595
 
1554
1596
  num_samples = _num_samples(df)
@@ -1561,24 +1603,34 @@ class FeaturesEnricher(TransformerMixin):
1561
1603
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1562
1604
  df = df.sample(n=sample_rows, random_state=self.random_state)
1563
1605
 
1564
- df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
- df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1606
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1607
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1608
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1566
1609
 
1567
- train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1610
+ train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1568
1611
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1569
1612
  y_sampled = train_df[TARGET].copy()
1570
1613
  enriched_X = X_sampled
1571
1614
 
1572
1615
  if eval_set is not None:
1573
1616
  for idx in range(len(eval_set)):
1574
- eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1617
+ eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1575
1618
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1576
1619
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1577
1620
  enriched_eval_X = eval_X_sampled
1578
1621
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1579
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1622
+ self.__cached_sampled_datasets = (
1623
+ X_sampled,
1624
+ y_sampled,
1625
+ enriched_X,
1626
+ eval_set_sampled_dict,
1627
+ search_keys,
1628
+ columns_renaming,
1629
+ )
1580
1630
 
1581
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1631
+ return self.__mk_sampled_data_tuple(
1632
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1633
+ )
1582
1634
 
1583
1635
  def __sample_balanced(
1584
1636
  self,
@@ -1590,7 +1642,7 @@ class FeaturesEnricher(TransformerMixin):
1590
1642
  search_keys = self.fit_search_keys
1591
1643
 
1592
1644
  rows_to_drop = None
1593
- has_date = self._get_date_column(search_keys) is not None
1645
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1594
1646
  task_type = self.model_task_type or define_task(
1595
1647
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1596
1648
  )
@@ -1644,9 +1696,18 @@ class FeaturesEnricher(TransformerMixin):
1644
1696
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1645
1697
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1646
1698
 
1647
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1699
+ self.__cached_sampled_datasets = (
1700
+ X_sampled,
1701
+ y_sampled,
1702
+ enriched_X,
1703
+ eval_set_sampled_dict,
1704
+ search_keys,
1705
+ self.fit_columns_renaming,
1706
+ )
1648
1707
 
1649
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1708
+ return self.__mk_sampled_data_tuple(
1709
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1710
+ )
1650
1711
 
1651
1712
  def __sample_imbalanced(
1652
1713
  self,
@@ -1686,7 +1747,7 @@ class FeaturesEnricher(TransformerMixin):
1686
1747
  tmp_target_name = "__target"
1687
1748
  df = df.rename(columns={TARGET: tmp_target_name})
1688
1749
 
1689
- enriched_df = self.__inner_transform(
1750
+ enriched_df, columns_renaming = self.__inner_transform(
1690
1751
  trace_id,
1691
1752
  df,
1692
1753
  exclude_features_sources=exclude_features_sources,
@@ -1734,7 +1795,7 @@ class FeaturesEnricher(TransformerMixin):
1734
1795
  tmp_target_name = "__target"
1735
1796
  df = df.rename(columns={TARGET: tmp_target_name})
1736
1797
 
1737
- enriched_Xy = self.__inner_transform(
1798
+ enriched_Xy, columns_renaming = self.__inner_transform(
1738
1799
  trace_id,
1739
1800
  df,
1740
1801
  exclude_features_sources=exclude_features_sources,
@@ -1759,9 +1820,18 @@ class FeaturesEnricher(TransformerMixin):
1759
1820
  y_sampled = enriched_Xy[TARGET].copy()
1760
1821
  enriched_X = enriched_Xy.drop(columns=TARGET)
1761
1822
 
1762
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1823
+ self.__cached_sampled_datasets = (
1824
+ X_sampled,
1825
+ y_sampled,
1826
+ enriched_X,
1827
+ eval_set_sampled_dict,
1828
+ self.search_keys,
1829
+ columns_renaming,
1830
+ )
1763
1831
 
1764
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1832
+ return self.__mk_sampled_data_tuple(
1833
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1834
+ )
1765
1835
 
1766
1836
  def __mk_sampled_data_tuple(
1767
1837
  self,
@@ -1770,6 +1840,7 @@ class FeaturesEnricher(TransformerMixin):
1770
1840
  enriched_X: pd.DataFrame,
1771
1841
  eval_set_sampled_dict: Dict,
1772
1842
  search_keys: Dict,
1843
+ columns_renaming: Dict[str, str],
1773
1844
  ):
1774
1845
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1775
1846
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1778,6 +1849,7 @@ class FeaturesEnricher(TransformerMixin):
1778
1849
  enriched_X=enriched_X,
1779
1850
  eval_set_sampled_dict=eval_set_sampled_dict,
1780
1851
  search_keys=search_keys,
1852
+ columns_renaming=columns_renaming,
1781
1853
  )
1782
1854
 
1783
1855
  def get_search_id(self) -> Optional[str]:
@@ -1866,7 +1938,7 @@ class FeaturesEnricher(TransformerMixin):
1866
1938
  progress_bar: Optional[ProgressBar] = None,
1867
1939
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1868
1940
  add_fit_system_record_id: bool = False,
1869
- ) -> pd.DataFrame:
1941
+ ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1870
1942
  if self._search_task is None:
1871
1943
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1872
1944
 
@@ -1879,13 +1951,13 @@ class FeaturesEnricher(TransformerMixin):
1879
1951
 
1880
1952
  if len(self.feature_names_) == 0:
1881
1953
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1882
- return X
1954
+ return X, {c: c for c in X.columns}
1883
1955
 
1884
1956
  if self._has_paid_features(exclude_features_sources):
1885
1957
  msg = self.bundle.get("transform_with_paid_features")
1886
1958
  self.logger.warning(msg)
1887
1959
  self.__display_support_link(msg)
1888
- return None
1960
+ return None, {c: c for c in X.columns}
1889
1961
 
1890
1962
  if not metrics_calculation:
1891
1963
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1896,7 +1968,7 @@ class FeaturesEnricher(TransformerMixin):
1896
1968
  self.logger.warning(msg)
1897
1969
  print(msg)
1898
1970
  show_request_quote_button()
1899
- return None
1971
+ return None, {c: c for c in X.columns}
1900
1972
  else:
1901
1973
  msg = self.bundle.get("transform_usage_info").format(
1902
1974
  transform_usage.limit, transform_usage.transformed_rows
@@ -1934,9 +2006,11 @@ class FeaturesEnricher(TransformerMixin):
1934
2006
  df = self.__add_country_code(df, search_keys)
1935
2007
 
1936
2008
  generated_features = []
1937
- date_column = self._get_date_column(search_keys)
2009
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1938
2010
  if date_column is not None:
1939
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2011
+ converter = DateTimeSearchKeyConverter(
2012
+ date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2013
+ )
1940
2014
  df = converter.convert(df)
1941
2015
  self.logger.info(f"Date column after convertion: {df[date_column]}")
1942
2016
  generated_features.extend(converter.generated_features)
@@ -1945,61 +2019,93 @@ class FeaturesEnricher(TransformerMixin):
1945
2019
  if self.add_date_if_missing:
1946
2020
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1947
2021
 
2022
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2023
+ if email_columns:
2024
+ generator = EmailDomainGenerator(email_columns)
2025
+ df = generator.generate(df)
2026
+ generated_features.extend(generator.generated_features)
2027
+
2028
+ normalizer = Normalizer(
2029
+ search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2030
+ )
2031
+ df = normalizer.normalize(df)
2032
+ columns_renaming = normalizer.columns_renaming
2033
+
1948
2034
  # Don't pass all features in backend on transform
1949
- original_features_for_transform = []
1950
2035
  runtime_parameters = self._get_copy_of_runtime_parameters()
1951
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
- if len(features_not_to_pass) > 0:
1953
- # Pass only features that need for transform
1954
- features_for_transform = self._search_task.get_features_for_transform()
1955
- if features_for_transform is not None and len(features_for_transform) > 0:
1956
- file_metadata = self._search_task.get_file_metadata(trace_id)
1957
- original_features_for_transform = [
1958
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
- ]
2036
+ features_for_transform = self._search_task.get_features_for_transform() or []
2037
+ if len(features_for_transform) > 0:
2038
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1960
2039
 
1961
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1962
-
1963
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2040
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1964
2041
 
1965
2042
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1966
2043
  df[columns_for_system_record_id], index=False
1967
2044
  ).astype("Float64")
1968
2045
 
1969
2046
  # Explode multiple search keys
1970
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2047
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1971
2048
 
1972
2049
  email_column = self._get_email_column(search_keys)
1973
2050
  hem_column = self._get_hem_column(search_keys)
1974
- email_converted_to_hem = False
1975
2051
  if email_column:
1976
2052
  converter = EmailSearchKeyConverter(
1977
- email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2053
+ email_column,
2054
+ hem_column,
2055
+ search_keys,
2056
+ columns_renaming,
2057
+ list(unnest_search_keys.keys()),
2058
+ self.logger,
1978
2059
  )
1979
2060
  df = converter.convert(df)
1980
- generated_features.extend(converter.generated_features)
1981
- email_converted_to_hem = converter.email_converted_to_hem
2061
+
2062
+ ip_column = self._get_ip_column(search_keys)
2063
+ if ip_column:
2064
+ converter = IpSearchKeyConverter(
2065
+ ip_column,
2066
+ search_keys,
2067
+ columns_renaming,
2068
+ list(unnest_search_keys.keys()),
2069
+ self.bundle,
2070
+ self.logger,
2071
+ )
2072
+ df = converter.convert(df)
2073
+
2074
+ phone_column = self._get_phone_column(search_keys)
2075
+ country_column = self._get_country_column(search_keys)
2076
+ if phone_column:
2077
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2078
+ df = converter.convert(df)
2079
+
2080
+ if country_column:
2081
+ converter = CountrySearchKeyConverter(country_column)
2082
+ df = converter.convert(df)
2083
+
2084
+ postal_code = self._get_postal_column(search_keys)
2085
+ if postal_code:
2086
+ converter = PostalCodeSearchKeyConverter(postal_code)
2087
+ df = converter.convert(df)
2088
+
1982
2089
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1983
2090
 
1984
2091
  meaning_types = {col: key.value for col, key in search_keys.items()}
1985
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
- for col in original_features_for_transform:
2092
+ for col in features_for_transform:
1987
2093
  meaning_types[col] = FileColumnMeaningType.FEATURE
1988
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
-
1990
- if email_converted_to_hem:
1991
- features_not_to_pass.append(email_column)
1992
-
1993
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2094
+ features_not_to_pass = [
2095
+ c
2096
+ for c in df.columns
2097
+ if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
2098
+ ]
1995
2099
 
1996
2100
  if add_fit_system_record_id:
1997
- df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2101
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2102
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2103
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1998
2104
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1999
2105
  features_not_to_pass.append(SORT_ID)
2000
2106
 
2001
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
-
2107
+ # search keys might be changed after explode
2108
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2003
2109
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2004
2110
  "Float64"
2005
2111
  )
@@ -2035,8 +2141,7 @@ class FeaturesEnricher(TransformerMixin):
2035
2141
  rest_client=self.rest_client,
2036
2142
  logger=self.logger,
2037
2143
  )
2038
- if email_converted_to_hem:
2039
- dataset.ignore_columns = [email_column]
2144
+ dataset.columns_renaming = columns_renaming
2040
2145
 
2041
2146
  if max_features is not None or importance_threshold is not None:
2042
2147
  exclude_features_sources = list(
@@ -2125,7 +2230,9 @@ class FeaturesEnricher(TransformerMixin):
2125
2230
  result = enrich()
2126
2231
 
2127
2232
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2128
- existing_filtered_columns = [c for c in filtered_columns if c in result.columns]
2233
+ existing_filtered_columns = [
2234
+ c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2235
+ ]
2129
2236
  selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2130
2237
  if add_fit_system_record_id:
2131
2238
  selecting_columns.append(SORT_ID)
@@ -2138,7 +2245,7 @@ class FeaturesEnricher(TransformerMixin):
2138
2245
  if add_fit_system_record_id:
2139
2246
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2140
2247
 
2141
- return result
2248
+ return result, columns_renaming
2142
2249
 
2143
2250
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2144
2251
  features_info = self._internal_features_info
@@ -2239,6 +2346,9 @@ class FeaturesEnricher(TransformerMixin):
2239
2346
  self.df_with_original_index = None
2240
2347
  self.__cached_sampled_datasets = None
2241
2348
  self.metrics = None
2349
+ self.fit_columns_renaming = None
2350
+ self.fit_dropped_features = set()
2351
+ self.fit_generated_features = []
2242
2352
 
2243
2353
  validated_X = self._validate_X(X)
2244
2354
  validated_y = self._validate_y(validated_X, y)
@@ -2285,9 +2395,10 @@ class FeaturesEnricher(TransformerMixin):
2285
2395
  self.fit_search_keys = self.search_keys.copy()
2286
2396
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2287
2397
 
2288
- maybe_date_column = self._get_date_column(self.fit_search_keys)
2398
+ maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2289
2399
  has_date = maybe_date_column is not None
2290
2400
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2401
+
2291
2402
  self._validate_binary_observations(validated_y, model_task_type)
2292
2403
 
2293
2404
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2317,7 +2428,13 @@ class FeaturesEnricher(TransformerMixin):
2317
2428
  self.fit_generated_features = []
2318
2429
 
2319
2430
  if has_date:
2320
- converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2431
+ converter = DateTimeSearchKeyConverter(
2432
+ maybe_date_column,
2433
+ self.date_format,
2434
+ self.logger,
2435
+ bundle=self.bundle,
2436
+ warnings_counter=self.warning_counter,
2437
+ )
2321
2438
  df = converter.convert(df, keep_time=True)
2322
2439
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2323
2440
  self.fit_generated_features.extend(converter.generated_features)
@@ -2326,6 +2443,12 @@ class FeaturesEnricher(TransformerMixin):
2326
2443
  if self.add_date_if_missing:
2327
2444
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2328
2445
 
2446
+ email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2447
+ if email_columns:
2448
+ generator = EmailDomainGenerator(email_columns)
2449
+ df = generator.generate(df)
2450
+ self.fit_generated_features.extend(generator.generated_features)
2451
+
2329
2452
  # Checks that need validated date
2330
2453
  validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2331
2454
 
@@ -2334,7 +2457,12 @@ class FeaturesEnricher(TransformerMixin):
2334
2457
 
2335
2458
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2336
2459
 
2337
- # TODO normalize and convert all columns
2460
+ normalizer = Normalizer(
2461
+ self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2462
+ )
2463
+ df = normalizer.normalize(df)
2464
+ columns_renaming = normalizer.columns_renaming
2465
+ self.fit_columns_renaming = columns_renaming
2338
2466
 
2339
2467
  df = remove_fintech_duplicates(
2340
2468
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2342,38 +2470,58 @@ class FeaturesEnricher(TransformerMixin):
2342
2470
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2343
2471
 
2344
2472
  # Explode multiple search keys
2345
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
- meaning_types = {
2347
- **{col: key.value for col, key in self.fit_search_keys.items()},
2348
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
- }
2350
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
- if eval_set is not None and len(eval_set) > 0:
2352
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2473
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2354
2474
 
2355
2475
  # TODO check that this is correct for enrichment
2356
2476
  self.df_with_original_index = df.copy()
2477
+ # TODO check maybe need to drop _time column from df_with_original_index
2357
2478
 
2358
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2479
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2359
2480
 
2360
2481
  # Convert EMAIL to HEM after unnesting to do it only with one column
2361
2482
  email_column = self._get_email_column(self.fit_search_keys)
2362
2483
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
- email_converted_to_hem = False
2364
2484
  if email_column:
2365
2485
  converter = EmailSearchKeyConverter(
2366
- email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2486
+ email_column,
2487
+ hem_column,
2488
+ self.fit_search_keys,
2489
+ columns_renaming,
2490
+ list(unnest_search_keys.keys()),
2491
+ self.logger,
2367
2492
  )
2368
2493
  df = converter.convert(df)
2369
- self.fit_generated_features.extend(converter.generated_features)
2370
- email_converted_to_hem = converter.email_converted_to_hem
2494
+
2495
+ ip_column = self._get_ip_column(self.fit_search_keys)
2496
+ if ip_column:
2497
+ converter = IpSearchKeyConverter(
2498
+ ip_column,
2499
+ self.fit_search_keys,
2500
+ columns_renaming,
2501
+ list(unnest_search_keys.keys()),
2502
+ self.bundle,
2503
+ self.logger,
2504
+ )
2505
+ df = converter.convert(df)
2506
+
2507
+ phone_column = self._get_phone_column(self.fit_search_keys)
2508
+ country_column = self._get_country_column(self.fit_search_keys)
2509
+ if phone_column:
2510
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2511
+ df = converter.convert(df)
2512
+
2513
+ if country_column:
2514
+ converter = CountrySearchKeyConverter(country_column)
2515
+ df = converter.convert(df)
2516
+
2517
+ postal_code = self._get_postal_column(self.fit_search_keys)
2518
+ if postal_code:
2519
+ converter = PostalCodeSearchKeyConverter(postal_code)
2520
+ df = converter.convert(df)
2371
2521
 
2372
2522
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2373
2523
  self.fit_search_keys.keys()
2374
2524
  )
2375
- if email_converted_to_hem:
2376
- non_feature_columns.append(email_column)
2377
2525
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2378
2526
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2379
2527
 
@@ -2385,9 +2533,6 @@ class FeaturesEnricher(TransformerMixin):
2385
2533
  self.fit_dropped_features.update(features_to_drop)
2386
2534
  df = df.drop(columns=features_to_drop)
2387
2535
 
2388
- if email_converted_to_hem:
2389
- self.fit_dropped_features.add(email_column)
2390
-
2391
2536
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2392
2537
 
2393
2538
  meaning_types = {
@@ -2401,7 +2546,12 @@ class FeaturesEnricher(TransformerMixin):
2401
2546
  if eval_set is not None and len(eval_set) > 0:
2402
2547
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2403
2548
 
2404
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2549
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2550
+
2551
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2552
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2553
+
2554
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2405
2555
 
2406
2556
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2407
2557
 
@@ -2419,8 +2569,7 @@ class FeaturesEnricher(TransformerMixin):
2419
2569
  rest_client=self.rest_client,
2420
2570
  logger=self.logger,
2421
2571
  )
2422
- if email_converted_to_hem:
2423
- dataset.ignore_columns = [email_column]
2572
+ dataset.columns_renaming = columns_renaming
2424
2573
 
2425
2574
  self.passed_features = [
2426
2575
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2809,7 +2958,7 @@ class FeaturesEnricher(TransformerMixin):
2809
2958
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2810
2959
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2811
2960
  else:
2812
- date_column = FeaturesEnricher._get_date_column(search_keys)
2961
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2813
2962
  sort_columns = [date_column] if date_column is not None else []
2814
2963
 
2815
2964
  # Xy = pd.concat([X, y], axis=1)
@@ -2905,10 +3054,10 @@ class FeaturesEnricher(TransformerMixin):
2905
3054
 
2906
3055
  do_without_pandas_limits(print_datasets_sample)
2907
3056
 
2908
- maybe_date_col = self._get_date_column(self.search_keys)
3057
+ maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2909
3058
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2910
3059
  # TODO cast date column to single dtype
2911
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3060
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2912
3061
  converted_X = date_converter.convert(X)
2913
3062
  min_date = converted_X[maybe_date_col].min()
2914
3063
  max_date = converted_X[maybe_date_col].max()
@@ -2935,12 +3084,6 @@ class FeaturesEnricher(TransformerMixin):
2935
3084
 
2936
3085
  return df
2937
3086
 
2938
- @staticmethod
2939
- def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
- for col, t in search_keys.items():
2941
- if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
- return col
2943
-
2944
3087
  @staticmethod
2945
3088
  def _add_current_date_as_key(
2946
3089
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -2956,7 +3099,7 @@ class FeaturesEnricher(TransformerMixin):
2956
3099
  logger.warning(msg)
2957
3100
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2958
3101
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2959
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3102
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2960
3103
  df = converter.convert(df)
2961
3104
  return df
2962
3105
 
@@ -2984,17 +3127,37 @@ class FeaturesEnricher(TransformerMixin):
2984
3127
  if len(cols) == 1:
2985
3128
  return cols[0]
2986
3129
 
3130
@staticmethod
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
    """Return the name of the column registered as the IP search key.

    Args:
        search_keys: Mapping of column name to its search key type.

    Returns:
        The single column whose key type equals ``SearchKey.IP``, or
        ``None`` when no column has that type.

    Raises:
        Exception: If more than one column has the ``SearchKey.IP`` type.
    """
    ip_columns = []
    for name, key_type in search_keys.items():
        if key_type == SearchKey.IP:
            ip_columns.append(name)

    if not ip_columns:
        return None
    if len(ip_columns) > 1:
        raise Exception("More than one ip column found after unnest")
    return ip_columns[0]
3137
+
2987
3138
  @staticmethod
2988
3139
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2989
3140
  for col, t in search_keys.items():
2990
3141
  if t == SearchKey.PHONE:
2991
3142
  return col
2992
3143
 
3144
@staticmethod
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
    """Return the first column registered as a COUNTRY search key.

    Args:
        search_keys: Mapping of column name to its search key type.

    Returns:
        The first column (in mapping iteration order) whose key type
        equals ``SearchKey.COUNTRY``, or ``None`` when there is none.
    """
    return next(
        (name for name, key_type in search_keys.items() if key_type == SearchKey.COUNTRY),
        None,
    )
3149
+
3150
@staticmethod
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
    """Return the first column registered as a POSTAL_CODE search key.

    Args:
        search_keys: Mapping of column name to its search key type.

    Returns:
        The first column (in mapping iteration order) whose key type
        equals ``SearchKey.POSTAL_CODE``, or ``None`` when there is none.
    """
    return next(
        (name for name, key_type in search_keys.items() if key_type == SearchKey.POSTAL_CODE),
        None,
    )
3155
+
2993
3156
  def _explode_multiple_search_keys(
2994
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3157
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2995
3158
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
2996
3159
  # find groups of multiple search keys
2997
- search_key_names_by_type: Dict[SearchKey, str] = {}
3160
+ search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2998
3161
  for key_name, key_type in search_keys.items():
2999
3162
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3000
3163
  search_key_names_by_type = {
@@ -3003,6 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
3003
3166
  if len(search_key_names_by_type) == 0:
3004
3167
  return df, {}
3005
3168
 
3169
+ self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
3006
3170
  multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3007
3171
  other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3008
3172
  exploded_dfs = []
@@ -3018,14 +3182,16 @@ class FeaturesEnricher(TransformerMixin):
3018
3182
  del search_keys[old_key]
3019
3183
  search_keys[new_search_key] = key_type
3020
3184
  unnest_search_keys[new_search_key] = key_names
3185
+ columns_renaming[new_search_key] = new_search_key
3021
3186
 
3022
3187
  df = pd.concat(exploded_dfs, ignore_index=True)
3188
+ self.logger.info(f"Finished explosion. Size after: {len(df)}")
3023
3189
  return df, unnest_search_keys
3024
3190
 
3025
3191
  def __add_fit_system_record_id(
3026
3192
  self,
3027
3193
  df: pd.DataFrame,
3028
- meaning_types: Dict[str, FileColumnMeaningType],
3194
+ # meaning_types: Dict[str, FileColumnMeaningType],
3029
3195
  search_keys: Dict[str, SearchKey],
3030
3196
  id_name: str,
3031
3197
  ) -> pd.DataFrame:
@@ -3048,39 +3214,37 @@ class FeaturesEnricher(TransformerMixin):
3048
3214
  ]
3049
3215
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3050
3216
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3051
- sort_exclude_columns.append(self._get_date_column(search_keys))
3217
+ sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3052
3218
  else:
3053
- date_column = self._get_date_column(search_keys)
3219
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3054
3220
  sort_columns = [date_column] if date_column is not None else []
3055
3221
 
3222
+ sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3223
+ sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3224
+
3056
3225
  other_columns = sorted(
3057
3226
  [
3058
3227
  c
3059
3228
  for c in df.columns
3060
- if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3229
+ if c not in sort_columns
3230
+ and c not in sorted_other_keys
3231
+ and c not in sort_exclude_columns
3232
+ and df[c].nunique() > 1
3061
3233
  ]
3062
- # [
3063
- # sk
3064
- # for sk, key_type in search_keys.items()
3065
- # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
- # and sk in df.columns
3067
- # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
- # ]
3069
3234
  )
3070
3235
 
3236
+ all_other_columns = sorted_other_keys + other_columns
3237
+
3071
3238
  search_keys_hash = "search_keys_hash"
3072
- if len(other_columns) > 0:
3239
+ if len(all_other_columns) > 0:
3073
3240
  sort_columns.append(search_keys_hash)
3074
- df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
3241
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3075
3242
 
3076
3243
  df = df.sort_values(by=sort_columns)
3077
3244
 
3078
3245
  if search_keys_hash in df.columns:
3079
3246
  df.drop(columns=search_keys_hash, inplace=True)
3080
3247
 
3081
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
- df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
-
3084
3248
  df = df.reset_index(drop=True).reset_index()
3085
3249
  # system_record_id saves correct order for fit
3086
3250
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3090,11 +3254,11 @@ class FeaturesEnricher(TransformerMixin):
3090
3254
  df.index.name = original_index_name
3091
3255
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3092
3256
 
3093
- meaning_types[id_name] = (
3094
- FileColumnMeaningType.SYSTEM_RECORD_ID
3095
- if id_name == SYSTEM_RECORD_ID
3096
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
- )
3257
+ # meaning_types[id_name] = (
3258
+ # FileColumnMeaningType.SYSTEM_RECORD_ID
3259
+ # if id_name == SYSTEM_RECORD_ID
3260
+ # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3261
+ # )
3098
3262
  return df
3099
3263
 
3100
3264
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3472,10 +3636,13 @@ class FeaturesEnricher(TransformerMixin):
3472
3636
  for _, key_type in search_keys.items():
3473
3637
  if not isinstance(key_type, SearchKey):
3474
3638
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3639
+
3475
3640
  valid_search_keys = {}
3476
3641
  unsupported_search_keys = {
3477
3642
  SearchKey.IP_RANGE_FROM,
3478
3643
  SearchKey.IP_RANGE_TO,
3644
+ SearchKey.IPV6_RANGE_FROM,
3645
+ SearchKey.IPV6_RANGE_TO,
3479
3646
  SearchKey.MSISDN_RANGE_FROM,
3480
3647
  SearchKey.MSISDN_RANGE_TO,
3481
3648
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3565,6 +3732,7 @@ class FeaturesEnricher(TransformerMixin):
3565
3732
  print(msg)
3566
3733
  self.logger.warning(msg)
3567
3734
  self.warning_counter.increment()
3735
+ # TODO maybe raise ValidationError
3568
3736
 
3569
3737
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3570
3738