upgini 1.1.315a3579.dev1__py3-none-any.whl → 1.1.316a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the package registry's advisory page for more details.

@@ -23,7 +23,6 @@ from pandas.api.types import (
23
23
  is_datetime64_any_dtype,
24
24
  is_numeric_dtype,
25
25
  is_object_dtype,
26
- is_period_dtype,
27
26
  is_string_dtype,
28
27
  )
29
28
  from scipy.stats import ks_2samp
@@ -61,11 +60,15 @@ from upgini.metadata import (
61
60
  SearchKey,
62
61
  )
63
62
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
63
+ from upgini.normalizer.normalize_utils import Normalizer
64
64
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
65
65
  from upgini.search_task import SearchTask
66
66
  from upgini.spinner import Spinner
67
67
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
68
- from upgini.utils.country_utils import CountrySearchKeyDetector
68
+ from upgini.utils.country_utils import (
69
+ CountrySearchKeyConverter,
70
+ CountrySearchKeyDetector,
71
+ )
69
72
  from upgini.utils.custom_loss_utils import (
70
73
  get_additional_params_custom_loss,
71
74
  get_runtime_params_custom_loss,
@@ -87,11 +90,19 @@ from upgini.utils.display_utils import (
87
90
  prepare_and_show_report,
88
91
  show_request_quote_button,
89
92
  )
90
- from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
93
+ from upgini.utils.email_utils import (
94
+ EmailDomainGenerator,
95
+ EmailSearchKeyConverter,
96
+ EmailSearchKeyDetector,
97
+ )
91
98
  from upgini.utils.features_validator import FeaturesValidator
92
99
  from upgini.utils.format import Format
93
- from upgini.utils.phone_utils import PhoneSearchKeyDetector
94
- from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
100
+ from upgini.utils.ip_utils import IpSearchKeyConverter
101
+ from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
102
+ from upgini.utils.postal_code_utils import (
103
+ PostalCodeSearchKeyConverter,
104
+ PostalCodeSearchKeyDetector,
105
+ )
95
106
 
96
107
  try:
97
108
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -237,6 +248,7 @@ class FeaturesEnricher(TransformerMixin):
237
248
 
238
249
  self.passed_features: List[str] = []
239
250
  self.df_with_original_index: Optional[pd.DataFrame] = None
251
+ self.fit_columns_renaming: Optional[Dict[str, str]] = None
240
252
  self.country_added = False
241
253
  self.fit_generated_features: List[str] = []
242
254
  self.fit_dropped_features: Set[str] = set()
@@ -247,7 +259,7 @@ class FeaturesEnricher(TransformerMixin):
247
259
  self.eval_set: Optional[List[Tuple]] = None
248
260
  self.autodetected_search_keys: Dict[str, SearchKey] = {}
249
261
  self.imbalanced = False
250
- self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
262
+ self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
251
263
 
252
264
  validate_version(self.logger)
253
265
  self.search_keys = search_keys or {}
@@ -706,7 +718,7 @@ class FeaturesEnricher(TransformerMixin):
706
718
 
707
719
  start_time = time.time()
708
720
  try:
709
- result = self.__inner_transform(
721
+ result, _ = self.__inner_transform(
710
722
  trace_id,
711
723
  X,
712
724
  exclude_features_sources=exclude_features_sources,
@@ -833,17 +845,37 @@ class FeaturesEnricher(TransformerMixin):
833
845
  self.logger.warning(msg)
834
846
  print(msg)
835
847
 
848
+ if X is not None and y is None:
849
+ raise ValidationError("X passed without y")
850
+
836
851
  self.__validate_search_keys(self.search_keys, self.search_id)
837
852
  effective_X = X if X is not None else self.X
838
853
  effective_y = y if y is not None else self.y
839
854
  effective_eval_set = eval_set if eval_set is not None else self.eval_set
840
855
  effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
841
856
 
857
+ if (
858
+ self._search_task is None
859
+ or self._search_task.provider_metadata_v2 is None
860
+ or len(self._search_task.provider_metadata_v2) == 0
861
+ or effective_X is None
862
+ or effective_y is None
863
+ ):
864
+ raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
865
+
866
+ validated_X = self._validate_X(effective_X)
867
+ validated_y = self._validate_y(validated_X, effective_y)
868
+ validated_eval_set = (
869
+ [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
870
+ if effective_eval_set is not None
871
+ else None
872
+ )
873
+
842
874
  try:
843
875
  self.__log_debug_information(
844
- effective_X,
845
- effective_y,
846
- effective_eval_set,
876
+ validated_X,
877
+ validated_y,
878
+ validated_eval_set,
847
879
  exclude_features_sources=exclude_features_sources,
848
880
  cv=cv if cv is not None else self.cv,
849
881
  importance_threshold=importance_threshold,
@@ -853,21 +885,9 @@ class FeaturesEnricher(TransformerMixin):
853
885
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
854
886
  )
855
887
 
856
- if (
857
- self._search_task is None
858
- or self._search_task.provider_metadata_v2 is None
859
- or len(self._search_task.provider_metadata_v2) == 0
860
- or effective_X is None
861
- or effective_y is None
862
- ):
863
- raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
864
-
865
- if X is not None and y is None:
866
- raise ValidationError("X passed without y")
867
-
868
888
  validate_scoring_argument(scoring)
869
889
 
870
- self._validate_baseline_score(effective_X, effective_eval_set)
890
+ self._validate_baseline_score(validated_X, validated_eval_set)
871
891
 
872
892
  if self._has_paid_features(exclude_features_sources):
873
893
  msg = self.bundle.get("metrics_with_paid_features")
@@ -876,7 +896,7 @@ class FeaturesEnricher(TransformerMixin):
876
896
  return None
877
897
 
878
898
  cat_features, search_keys_for_metrics = self._get_client_cat_features(
879
- estimator, effective_X, self.search_keys
899
+ estimator, validated_X, self.search_keys
880
900
  )
881
901
 
882
902
  prepared_data = self._prepare_data_for_metrics(
@@ -906,8 +926,16 @@ class FeaturesEnricher(TransformerMixin):
906
926
  search_keys,
907
927
  groups,
908
928
  _cv,
929
+ columns_renaming,
909
930
  ) = prepared_data
910
931
 
932
+ # rename cat_features
933
+ if cat_features:
934
+ for new_c, old_c in columns_renaming.items():
935
+ if old_c in cat_features:
936
+ cat_features.remove(old_c)
937
+ cat_features.append(new_c)
938
+
911
939
  gc.collect()
912
940
 
913
941
  print(self.bundle.get("metrics_start"))
@@ -920,7 +948,7 @@ class FeaturesEnricher(TransformerMixin):
920
948
 
921
949
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
922
950
 
923
- has_date = self._get_date_column(search_keys) is not None
951
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
924
952
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
925
953
 
926
954
  wrapper = EstimatorWrapper.create(
@@ -1013,10 +1041,10 @@ class FeaturesEnricher(TransformerMixin):
1013
1041
  self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
1014
1042
  }
1015
1043
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1016
- y_sorted
1044
+ validated_y
1017
1045
  ):
1018
1046
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1019
- np.mean(effective_y), 4
1047
+ np.mean(validated_y), 4
1020
1048
  )
1021
1049
  if etalon_metric is not None:
1022
1050
  train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
@@ -1086,10 +1114,10 @@ class FeaturesEnricher(TransformerMixin):
1086
1114
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1087
1115
  }
1088
1116
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1089
- eval_y_sorted
1117
+ validated_eval_set[idx][1]
1090
1118
  ):
1091
1119
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1092
- np.mean(effective_eval_set[idx][1]), 4
1120
+ np.mean(validated_eval_set[idx][1]), 4
1093
1121
  )
1094
1122
  if etalon_eval_metric is not None:
1095
1123
  eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
@@ -1113,7 +1141,7 @@ class FeaturesEnricher(TransformerMixin):
1113
1141
  )
1114
1142
 
1115
1143
  uplift_col = self.bundle.get("quality_metrics_uplift_header")
1116
- date_column = self._get_date_column(search_keys)
1144
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1117
1145
  if (
1118
1146
  uplift_col in metrics_df.columns
1119
1147
  and (metrics_df[uplift_col] < 0).any()
@@ -1195,27 +1223,6 @@ class FeaturesEnricher(TransformerMixin):
1195
1223
  def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
1196
1224
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1197
1225
 
1198
- def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
1199
- search_keys = self.search_keys.copy()
1200
- search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1201
-
1202
- extended_X = x.copy()
1203
- generated_features = []
1204
- date_column = self._get_date_column(search_keys)
1205
- if date_column is not None:
1206
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1207
- extended_X = converter.convert(extended_X, keep_time=True)
1208
- generated_features.extend(converter.generated_features)
1209
- email_column = self._get_email_column(search_keys)
1210
- hem_column = self._get_hem_column(search_keys)
1211
- if email_column:
1212
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1213
- extended_X = converter.convert(extended_X)
1214
- generated_features.extend(converter.generated_features)
1215
- generated_features = [f for f in generated_features if f in self.fit_generated_features]
1216
-
1217
- return extended_X, search_keys
1218
-
1219
1226
  def _is_input_same_as_fit(
1220
1227
  self,
1221
1228
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
@@ -1259,7 +1266,7 @@ class FeaturesEnricher(TransformerMixin):
1259
1266
  groups = None
1260
1267
 
1261
1268
  if not isinstance(_cv, BaseCrossValidator):
1262
- date_column = self._get_date_column(search_keys)
1269
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1263
1270
  date_series = X[date_column] if date_column is not None else None
1264
1271
  _cv, groups = CVConfig(
1265
1272
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
@@ -1282,7 +1289,7 @@ class FeaturesEnricher(TransformerMixin):
1282
1289
 
1283
1290
  def _get_client_cat_features(
1284
1291
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1285
- ) -> Optional[List[str]]:
1292
+ ) -> Tuple[Optional[List[str]], List[str]]:
1286
1293
  cat_features = None
1287
1294
  search_keys_for_metrics = []
1288
1295
  if (
@@ -1342,11 +1349,15 @@ class FeaturesEnricher(TransformerMixin):
1342
1349
  progress_bar,
1343
1350
  progress_callback,
1344
1351
  )
1345
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
1352
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1353
+ sampled_data
1354
+ )
1346
1355
 
1347
1356
  excluding_search_keys = list(search_keys.keys())
1348
1357
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1349
- excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1358
+ for sk in excluding_search_keys:
1359
+ if columns_renaming.get(sk) in search_keys_for_metrics:
1360
+ excluding_search_keys.remove(sk)
1350
1361
 
1351
1362
  client_features = [
1352
1363
  c
@@ -1363,6 +1374,7 @@ class FeaturesEnricher(TransformerMixin):
1363
1374
  importance_threshold,
1364
1375
  max_features,
1365
1376
  )
1377
+ filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1366
1378
 
1367
1379
  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
1368
1380
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
@@ -1392,9 +1404,12 @@ class FeaturesEnricher(TransformerMixin):
1392
1404
  fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
1393
1405
  fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
1394
1406
 
1407
+ # TODO maybe there is no more need for these convertions
1395
1408
  # Remove datetime features
1396
1409
  datetime_features = [
1397
- f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
1410
+ f
1411
+ for f in fitting_X.columns
1412
+ if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
1398
1413
  ]
1399
1414
  if len(datetime_features) > 0:
1400
1415
  self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
@@ -1479,6 +1494,7 @@ class FeaturesEnricher(TransformerMixin):
1479
1494
  search_keys,
1480
1495
  groups,
1481
1496
  cv,
1497
+ columns_renaming,
1482
1498
  )
1483
1499
 
1484
1500
  @dataclass
@@ -1488,6 +1504,7 @@ class FeaturesEnricher(TransformerMixin):
1488
1504
  enriched_X: pd.DataFrame
1489
1505
  eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1490
1506
  search_keys: Dict[str, SearchKey]
1507
+ columns_renaming: Dict[str, str]
1491
1508
 
1492
1509
  def _sample_data_for_metrics(
1493
1510
  self,
@@ -1527,11 +1544,15 @@ class FeaturesEnricher(TransformerMixin):
1527
1544
  )
1528
1545
 
1529
1546
  def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1530
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1547
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1548
+ self.__cached_sampled_datasets
1549
+ )
1531
1550
  if exclude_features_sources:
1532
1551
  enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
1533
1552
 
1534
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1553
+ return self.__mk_sampled_data_tuple(
1554
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1555
+ )
1535
1556
 
1536
1557
  def __sample_only_input(
1537
1558
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
@@ -1549,6 +1570,28 @@ class FeaturesEnricher(TransformerMixin):
1549
1570
  eval_xy[EVAL_SET_INDEX] = idx + 1
1550
1571
  df = pd.concat([df, eval_xy])
1551
1572
 
1573
+ search_keys = self.search_keys.copy()
1574
+ search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1575
+
1576
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1577
+ generated_features = []
1578
+ if date_column is not None:
1579
+ converter = DateTimeSearchKeyConverter(
1580
+ date_column, self.date_format, self.logger, self.bundle, silent_mode=True
1581
+ )
1582
+ df = converter.convert(df, keep_time=True)
1583
+ generated_features = converter.generated_features
1584
+
1585
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
1586
+ if email_columns:
1587
+ generator = EmailDomainGenerator(email_columns)
1588
+ df = generator.generate(df)
1589
+ generated_features.extend(generator.generated_features)
1590
+
1591
+ normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1592
+ df = normalizer.normalize(df)
1593
+ columns_renaming = normalizer.columns_renaming
1594
+
1552
1595
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
1553
1596
 
1554
1597
  num_samples = _num_samples(df)
@@ -1561,24 +1604,34 @@ class FeaturesEnricher(TransformerMixin):
1561
1604
  self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
1562
1605
  df = df.sample(n=sample_rows, random_state=self.random_state)
1563
1606
 
1564
- df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1565
- df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1607
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
1608
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1609
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1566
1610
 
1567
- train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1611
+ train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1568
1612
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1569
1613
  y_sampled = train_df[TARGET].copy()
1570
1614
  enriched_X = X_sampled
1571
1615
 
1572
1616
  if eval_set is not None:
1573
1617
  for idx in range(len(eval_set)):
1574
- eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1618
+ eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
1575
1619
  eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1576
1620
  eval_y_sampled = eval_xy_sampled[TARGET].copy()
1577
1621
  enriched_eval_X = eval_X_sampled
1578
1622
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1579
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1623
+ self.__cached_sampled_datasets = (
1624
+ X_sampled,
1625
+ y_sampled,
1626
+ enriched_X,
1627
+ eval_set_sampled_dict,
1628
+ search_keys,
1629
+ columns_renaming,
1630
+ )
1580
1631
 
1581
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1632
+ return self.__mk_sampled_data_tuple(
1633
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
1634
+ )
1582
1635
 
1583
1636
  def __sample_balanced(
1584
1637
  self,
@@ -1590,7 +1643,7 @@ class FeaturesEnricher(TransformerMixin):
1590
1643
  search_keys = self.fit_search_keys
1591
1644
 
1592
1645
  rows_to_drop = None
1593
- has_date = self._get_date_column(search_keys) is not None
1646
+ has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
1594
1647
  task_type = self.model_task_type or define_task(
1595
1648
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1596
1649
  )
@@ -1644,9 +1697,18 @@ class FeaturesEnricher(TransformerMixin):
1644
1697
  enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
1645
1698
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1646
1699
 
1647
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1700
+ self.__cached_sampled_datasets = (
1701
+ X_sampled,
1702
+ y_sampled,
1703
+ enriched_X,
1704
+ eval_set_sampled_dict,
1705
+ search_keys,
1706
+ self.fit_columns_renaming,
1707
+ )
1648
1708
 
1649
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1709
+ return self.__mk_sampled_data_tuple(
1710
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
1711
+ )
1650
1712
 
1651
1713
  def __sample_imbalanced(
1652
1714
  self,
@@ -1686,7 +1748,7 @@ class FeaturesEnricher(TransformerMixin):
1686
1748
  tmp_target_name = "__target"
1687
1749
  df = df.rename(columns={TARGET: tmp_target_name})
1688
1750
 
1689
- enriched_df = self.__inner_transform(
1751
+ enriched_df, columns_renaming = self.__inner_transform(
1690
1752
  trace_id,
1691
1753
  df,
1692
1754
  exclude_features_sources=exclude_features_sources,
@@ -1734,7 +1796,7 @@ class FeaturesEnricher(TransformerMixin):
1734
1796
  tmp_target_name = "__target"
1735
1797
  df = df.rename(columns={TARGET: tmp_target_name})
1736
1798
 
1737
- enriched_Xy = self.__inner_transform(
1799
+ enriched_Xy, columns_renaming = self.__inner_transform(
1738
1800
  trace_id,
1739
1801
  df,
1740
1802
  exclude_features_sources=exclude_features_sources,
@@ -1759,9 +1821,18 @@ class FeaturesEnricher(TransformerMixin):
1759
1821
  y_sampled = enriched_Xy[TARGET].copy()
1760
1822
  enriched_X = enriched_Xy.drop(columns=TARGET)
1761
1823
 
1762
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1824
+ self.__cached_sampled_datasets = (
1825
+ X_sampled,
1826
+ y_sampled,
1827
+ enriched_X,
1828
+ eval_set_sampled_dict,
1829
+ self.search_keys,
1830
+ columns_renaming,
1831
+ )
1763
1832
 
1764
- return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
1833
+ return self.__mk_sampled_data_tuple(
1834
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
1835
+ )
1765
1836
 
1766
1837
  def __mk_sampled_data_tuple(
1767
1838
  self,
@@ -1770,6 +1841,7 @@ class FeaturesEnricher(TransformerMixin):
1770
1841
  enriched_X: pd.DataFrame,
1771
1842
  eval_set_sampled_dict: Dict,
1772
1843
  search_keys: Dict,
1844
+ columns_renaming: Dict[str, str],
1773
1845
  ):
1774
1846
  search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1775
1847
  return FeaturesEnricher._SampledDataForMetrics(
@@ -1778,6 +1850,7 @@ class FeaturesEnricher(TransformerMixin):
1778
1850
  enriched_X=enriched_X,
1779
1851
  eval_set_sampled_dict=eval_set_sampled_dict,
1780
1852
  search_keys=search_keys,
1853
+ columns_renaming=columns_renaming,
1781
1854
  )
1782
1855
 
1783
1856
  def get_search_id(self) -> Optional[str]:
@@ -1866,7 +1939,7 @@ class FeaturesEnricher(TransformerMixin):
1866
1939
  progress_bar: Optional[ProgressBar] = None,
1867
1940
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1868
1941
  add_fit_system_record_id: bool = False,
1869
- ) -> pd.DataFrame:
1942
+ ) -> Tuple[pd.DataFrame, Dict[str, str]]:
1870
1943
  if self._search_task is None:
1871
1944
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
1872
1945
 
@@ -1879,13 +1952,13 @@ class FeaturesEnricher(TransformerMixin):
1879
1952
 
1880
1953
  if len(self.feature_names_) == 0:
1881
1954
  self.logger.warning(self.bundle.get("no_important_features_for_transform"))
1882
- return X
1955
+ return X, {c: c for c in X.columns}
1883
1956
 
1884
1957
  if self._has_paid_features(exclude_features_sources):
1885
1958
  msg = self.bundle.get("transform_with_paid_features")
1886
1959
  self.logger.warning(msg)
1887
1960
  self.__display_support_link(msg)
1888
- return None
1961
+ return None, {c: c for c in X.columns}
1889
1962
 
1890
1963
  if not metrics_calculation:
1891
1964
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -1896,7 +1969,7 @@ class FeaturesEnricher(TransformerMixin):
1896
1969
  self.logger.warning(msg)
1897
1970
  print(msg)
1898
1971
  show_request_quote_button()
1899
- return None
1972
+ return None, {c: c for c in X.columns}
1900
1973
  else:
1901
1974
  msg = self.bundle.get("transform_usage_info").format(
1902
1975
  transform_usage.limit, transform_usage.transformed_rows
@@ -1934,9 +2007,11 @@ class FeaturesEnricher(TransformerMixin):
1934
2007
  df = self.__add_country_code(df, search_keys)
1935
2008
 
1936
2009
  generated_features = []
1937
- date_column = self._get_date_column(search_keys)
2010
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
1938
2011
  if date_column is not None:
1939
- converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2012
+ converter = DateTimeSearchKeyConverter(
2013
+ date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
2014
+ )
1940
2015
  df = converter.convert(df)
1941
2016
  self.logger.info(f"Date column after convertion: {df[date_column]}")
1942
2017
  generated_features.extend(converter.generated_features)
@@ -1945,61 +2020,93 @@ class FeaturesEnricher(TransformerMixin):
1945
2020
  if self.add_date_if_missing:
1946
2021
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1947
2022
 
2023
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2024
+ if email_columns:
2025
+ generator = EmailDomainGenerator(email_columns)
2026
+ df = generator.generate(df)
2027
+ generated_features.extend(generator.generated_features)
2028
+
2029
+ normalizer = Normalizer(
2030
+ search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2031
+ )
2032
+ df = normalizer.normalize(df)
2033
+ columns_renaming = normalizer.columns_renaming
2034
+
1948
2035
  # Don't pass all features in backend on transform
1949
- original_features_for_transform = []
1950
2036
  runtime_parameters = self._get_copy_of_runtime_parameters()
1951
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1952
- if len(features_not_to_pass) > 0:
1953
- # Pass only features that need for transform
1954
- features_for_transform = self._search_task.get_features_for_transform()
1955
- if features_for_transform is not None and len(features_for_transform) > 0:
1956
- file_metadata = self._search_task.get_file_metadata(trace_id)
1957
- original_features_for_transform = [
1958
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1959
- ]
2037
+ features_for_transform = self._search_task.get_features_for_transform() or []
2038
+ if len(features_for_transform) > 0:
2039
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1960
2040
 
1961
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1962
-
1963
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2041
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
1964
2042
 
1965
2043
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1966
2044
  df[columns_for_system_record_id], index=False
1967
2045
  ).astype("Float64")
1968
2046
 
1969
2047
  # Explode multiple search keys
1970
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
2048
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
1971
2049
 
1972
2050
  email_column = self._get_email_column(search_keys)
1973
2051
  hem_column = self._get_hem_column(search_keys)
1974
- email_converted_to_hem = False
1975
2052
  if email_column:
1976
2053
  converter = EmailSearchKeyConverter(
1977
- email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
2054
+ email_column,
2055
+ hem_column,
2056
+ search_keys,
2057
+ columns_renaming,
2058
+ list(unnest_search_keys.keys()),
2059
+ self.logger,
1978
2060
  )
1979
2061
  df = converter.convert(df)
1980
- generated_features.extend(converter.generated_features)
1981
- email_converted_to_hem = converter.email_converted_to_hem
2062
+
2063
+ ip_column = self._get_ip_column(search_keys)
2064
+ if ip_column:
2065
+ converter = IpSearchKeyConverter(
2066
+ ip_column,
2067
+ search_keys,
2068
+ columns_renaming,
2069
+ list(unnest_search_keys.keys()),
2070
+ self.bundle,
2071
+ self.logger,
2072
+ )
2073
+ df = converter.convert(df)
2074
+
2075
+ phone_column = self._get_phone_column(search_keys)
2076
+ country_column = self._get_country_column(search_keys)
2077
+ if phone_column:
2078
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2079
+ df = converter.convert(df)
2080
+
2081
+ if country_column:
2082
+ converter = CountrySearchKeyConverter(country_column)
2083
+ df = converter.convert(df)
2084
+
2085
+ postal_code = self._get_postal_column(search_keys)
2086
+ if postal_code:
2087
+ converter = PostalCodeSearchKeyConverter(postal_code)
2088
+ df = converter.convert(df)
2089
+
1982
2090
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1983
2091
 
1984
2092
  meaning_types = {col: key.value for col, key in search_keys.items()}
1985
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1986
- for col in original_features_for_transform:
2093
+ for col in features_for_transform:
1987
2094
  meaning_types[col] = FileColumnMeaningType.FEATURE
1988
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1989
-
1990
- if email_converted_to_hem:
1991
- features_not_to_pass.append(email_column)
1992
-
1993
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1994
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
2095
+ features_not_to_pass = [
2096
+ c
2097
+ for c in df.columns
2098
+ if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
2099
+ ]
1995
2100
 
1996
2101
  if add_fit_system_record_id:
1997
- df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
2102
+ df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2103
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2104
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1998
2105
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1999
2106
  features_not_to_pass.append(SORT_ID)
2000
2107
 
2001
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2002
-
2108
+ # search keys might be changed after explode
2109
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2003
2110
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2004
2111
  "Float64"
2005
2112
  )
@@ -2035,8 +2142,7 @@ class FeaturesEnricher(TransformerMixin):
2035
2142
  rest_client=self.rest_client,
2036
2143
  logger=self.logger,
2037
2144
  )
2038
- if email_converted_to_hem:
2039
- dataset.ignore_columns = [email_column]
2145
+ dataset.columns_renaming = columns_renaming
2040
2146
 
2041
2147
  if max_features is not None or importance_threshold is not None:
2042
2148
  exclude_features_sources = list(
@@ -2125,7 +2231,9 @@ class FeaturesEnricher(TransformerMixin):
2125
2231
  result = enrich()
2126
2232
 
2127
2233
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2128
- existing_filtered_columns = [c for c in filtered_columns if c in result.columns]
2234
+ existing_filtered_columns = [
2235
+ c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2236
+ ]
2129
2237
  selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2130
2238
  if add_fit_system_record_id:
2131
2239
  selecting_columns.append(SORT_ID)
@@ -2138,7 +2246,7 @@ class FeaturesEnricher(TransformerMixin):
2138
2246
  if add_fit_system_record_id:
2139
2247
  result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2140
2248
 
2141
- return result
2249
+ return result, columns_renaming
2142
2250
 
2143
2251
  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2144
2252
  features_info = self._internal_features_info
@@ -2239,6 +2347,9 @@ class FeaturesEnricher(TransformerMixin):
2239
2347
  self.df_with_original_index = None
2240
2348
  self.__cached_sampled_datasets = None
2241
2349
  self.metrics = None
2350
+ self.fit_columns_renaming = None
2351
+ self.fit_dropped_features = set()
2352
+ self.fit_generated_features = []
2242
2353
 
2243
2354
  validated_X = self._validate_X(X)
2244
2355
  validated_y = self._validate_y(validated_X, y)
@@ -2285,9 +2396,10 @@ class FeaturesEnricher(TransformerMixin):
2285
2396
  self.fit_search_keys = self.search_keys.copy()
2286
2397
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2287
2398
 
2288
- maybe_date_column = self._get_date_column(self.fit_search_keys)
2399
+ maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2289
2400
  has_date = maybe_date_column is not None
2290
2401
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2402
+
2291
2403
  self._validate_binary_observations(validated_y, model_task_type)
2292
2404
 
2293
2405
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2317,7 +2429,13 @@ class FeaturesEnricher(TransformerMixin):
2317
2429
  self.fit_generated_features = []
2318
2430
 
2319
2431
  if has_date:
2320
- converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
2432
+ converter = DateTimeSearchKeyConverter(
2433
+ maybe_date_column,
2434
+ self.date_format,
2435
+ self.logger,
2436
+ bundle=self.bundle,
2437
+ warnings_counter=self.warning_counter,
2438
+ )
2321
2439
  df = converter.convert(df, keep_time=True)
2322
2440
  self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
2323
2441
  self.fit_generated_features.extend(converter.generated_features)
@@ -2326,6 +2444,12 @@ class FeaturesEnricher(TransformerMixin):
2326
2444
  if self.add_date_if_missing:
2327
2445
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2328
2446
 
2447
+ email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2448
+ if email_columns:
2449
+ generator = EmailDomainGenerator(email_columns)
2450
+ df = generator.generate(df)
2451
+ self.fit_generated_features.extend(generator.generated_features)
2452
+
2329
2453
  # Checks that need validated date
2330
2454
  validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2331
2455
 
@@ -2334,7 +2458,12 @@ class FeaturesEnricher(TransformerMixin):
2334
2458
 
2335
2459
  self.__adjust_cv(df, maybe_date_column, model_task_type)
2336
2460
 
2337
- # TODO normalize and convert all columns
2461
+ normalizer = Normalizer(
2462
+ self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2463
+ )
2464
+ df = normalizer.normalize(df)
2465
+ columns_renaming = normalizer.columns_renaming
2466
+ self.fit_columns_renaming = columns_renaming
2338
2467
 
2339
2468
  df = remove_fintech_duplicates(
2340
2469
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2342,38 +2471,58 @@ class FeaturesEnricher(TransformerMixin):
2342
2471
  df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2343
2472
 
2344
2473
  # Explode multiple search keys
2345
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2346
- meaning_types = {
2347
- **{col: key.value for col, key in self.fit_search_keys.items()},
2348
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2349
- }
2350
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2351
- if eval_set is not None and len(eval_set) > 0:
2352
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2353
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2474
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2354
2475
 
2355
2476
  # TODO check that this is correct for enrichment
2356
2477
  self.df_with_original_index = df.copy()
2478
+ # TODO check maybe need to drop _time column from df_with_original_index
2357
2479
 
2358
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2480
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2359
2481
 
2360
2482
  # Convert EMAIL to HEM after unnesting to do it only with one column
2361
2483
  email_column = self._get_email_column(self.fit_search_keys)
2362
2484
  hem_column = self._get_hem_column(self.fit_search_keys)
2363
- email_converted_to_hem = False
2364
2485
  if email_column:
2365
2486
  converter = EmailSearchKeyConverter(
2366
- email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2487
+ email_column,
2488
+ hem_column,
2489
+ self.fit_search_keys,
2490
+ columns_renaming,
2491
+ list(unnest_search_keys.keys()),
2492
+ self.logger,
2367
2493
  )
2368
2494
  df = converter.convert(df)
2369
- self.fit_generated_features.extend(converter.generated_features)
2370
- email_converted_to_hem = converter.email_converted_to_hem
2495
+
2496
+ ip_column = self._get_ip_column(self.fit_search_keys)
2497
+ if ip_column:
2498
+ converter = IpSearchKeyConverter(
2499
+ ip_column,
2500
+ self.fit_search_keys,
2501
+ columns_renaming,
2502
+ list(unnest_search_keys.keys()),
2503
+ self.bundle,
2504
+ self.logger,
2505
+ )
2506
+ df = converter.convert(df)
2507
+
2508
+ phone_column = self._get_phone_column(self.fit_search_keys)
2509
+ country_column = self._get_country_column(self.fit_search_keys)
2510
+ if phone_column:
2511
+ converter = PhoneSearchKeyConverter(phone_column, country_column)
2512
+ df = converter.convert(df)
2513
+
2514
+ if country_column:
2515
+ converter = CountrySearchKeyConverter(country_column)
2516
+ df = converter.convert(df)
2517
+
2518
+ postal_code = self._get_postal_column(self.fit_search_keys)
2519
+ if postal_code:
2520
+ converter = PostalCodeSearchKeyConverter(postal_code)
2521
+ df = converter.convert(df)
2371
2522
 
2372
2523
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2373
2524
  self.fit_search_keys.keys()
2374
2525
  )
2375
- if email_converted_to_hem:
2376
- non_feature_columns.append(email_column)
2377
2526
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2378
2527
  non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
2379
2528
 
@@ -2385,9 +2534,6 @@ class FeaturesEnricher(TransformerMixin):
2385
2534
  self.fit_dropped_features.update(features_to_drop)
2386
2535
  df = df.drop(columns=features_to_drop)
2387
2536
 
2388
- if email_converted_to_hem:
2389
- self.fit_dropped_features.add(email_column)
2390
-
2391
2537
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2392
2538
 
2393
2539
  meaning_types = {
@@ -2401,7 +2547,12 @@ class FeaturesEnricher(TransformerMixin):
2401
2547
  if eval_set is not None and len(eval_set) > 0:
2402
2548
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2403
2549
 
2404
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2550
+ df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
2551
+
2552
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2553
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2554
+
2555
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2405
2556
 
2406
2557
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2407
2558
 
@@ -2419,8 +2570,7 @@ class FeaturesEnricher(TransformerMixin):
2419
2570
  rest_client=self.rest_client,
2420
2571
  logger=self.logger,
2421
2572
  )
2422
- if email_converted_to_hem:
2423
- dataset.ignore_columns = [email_column]
2573
+ dataset.columns_renaming = columns_renaming
2424
2574
 
2425
2575
  self.passed_features = [
2426
2576
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2809,7 +2959,7 @@ class FeaturesEnricher(TransformerMixin):
2809
2959
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
2810
2960
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2811
2961
  else:
2812
- date_column = FeaturesEnricher._get_date_column(search_keys)
2962
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2813
2963
  sort_columns = [date_column] if date_column is not None else []
2814
2964
 
2815
2965
  # Xy = pd.concat([X, y], axis=1)
@@ -2905,10 +3055,10 @@ class FeaturesEnricher(TransformerMixin):
2905
3055
 
2906
3056
  do_without_pandas_limits(print_datasets_sample)
2907
3057
 
2908
- maybe_date_col = self._get_date_column(self.search_keys)
3058
+ maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2909
3059
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2910
3060
  # TODO cast date column to single dtype
2911
- date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
3061
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
2912
3062
  converted_X = date_converter.convert(X)
2913
3063
  min_date = converted_X[maybe_date_col].min()
2914
3064
  max_date = converted_X[maybe_date_col].max()
@@ -2935,12 +3085,6 @@ class FeaturesEnricher(TransformerMixin):
2935
3085
 
2936
3086
  return df
2937
3087
 
2938
- @staticmethod
2939
- def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2940
- for col, t in search_keys.items():
2941
- if t in [SearchKey.DATE, SearchKey.DATETIME]:
2942
- return col
2943
-
2944
3088
  @staticmethod
2945
3089
  def _add_current_date_as_key(
2946
3090
  df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
@@ -2956,7 +3100,7 @@ class FeaturesEnricher(TransformerMixin):
2956
3100
  logger.warning(msg)
2957
3101
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2958
3102
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2959
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
3103
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
2960
3104
  df = converter.convert(df)
2961
3105
  return df
2962
3106
 
@@ -2984,17 +3128,37 @@ class FeaturesEnricher(TransformerMixin):
2984
3128
  if len(cols) == 1:
2985
3129
  return cols[0]
2986
3130
 
3131
+ @staticmethod
3132
+ def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3133
+ cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3134
+ if len(cols) > 1:
3135
+ raise Exception("More than one ip column found after unnest")
3136
+ if len(cols) == 1:
3137
+ return cols[0]
3138
+
2987
3139
  @staticmethod
2988
3140
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2989
3141
  for col, t in search_keys.items():
2990
3142
  if t == SearchKey.PHONE:
2991
3143
  return col
2992
3144
 
3145
+ @staticmethod
3146
+ def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3147
+ for col, t in search_keys.items():
3148
+ if t == SearchKey.COUNTRY:
3149
+ return col
3150
+
3151
+ @staticmethod
3152
+ def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3153
+ for col, t in search_keys.items():
3154
+ if t == SearchKey.POSTAL_CODE:
3155
+ return col
3156
+
2993
3157
  def _explode_multiple_search_keys(
2994
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3158
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
2995
3159
  ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
2996
3160
  # find groups of multiple search keys
2997
- search_key_names_by_type: Dict[SearchKey, str] = {}
3161
+ search_key_names_by_type: Dict[SearchKey, List[str]] = {}
2998
3162
  for key_name, key_type in search_keys.items():
2999
3163
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3000
3164
  search_key_names_by_type = {
@@ -3003,6 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
3003
3167
  if len(search_key_names_by_type) == 0:
3004
3168
  return df, {}
3005
3169
 
3170
+ self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
3006
3171
  multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3007
3172
  other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3008
3173
  exploded_dfs = []
@@ -3018,14 +3183,16 @@ class FeaturesEnricher(TransformerMixin):
3018
3183
  del search_keys[old_key]
3019
3184
  search_keys[new_search_key] = key_type
3020
3185
  unnest_search_keys[new_search_key] = key_names
3186
+ columns_renaming[new_search_key] = new_search_key
3021
3187
 
3022
3188
  df = pd.concat(exploded_dfs, ignore_index=True)
3189
+ self.logger.info(f"Finished explosion. Size after: {len(df)}")
3023
3190
  return df, unnest_search_keys
3024
3191
 
3025
3192
  def __add_fit_system_record_id(
3026
3193
  self,
3027
3194
  df: pd.DataFrame,
3028
- meaning_types: Dict[str, FileColumnMeaningType],
3195
+ # meaning_types: Dict[str, FileColumnMeaningType],
3029
3196
  search_keys: Dict[str, SearchKey],
3030
3197
  id_name: str,
3031
3198
  ) -> pd.DataFrame:
@@ -3048,39 +3215,37 @@ class FeaturesEnricher(TransformerMixin):
3048
3215
  ]
3049
3216
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3050
3217
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3051
- sort_exclude_columns.append(self._get_date_column(search_keys))
3218
+ sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
3052
3219
  else:
3053
- date_column = self._get_date_column(search_keys)
3220
+ date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3054
3221
  sort_columns = [date_column] if date_column is not None else []
3055
3222
 
3223
+ sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3224
+ sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3225
+
3056
3226
  other_columns = sorted(
3057
3227
  [
3058
3228
  c
3059
3229
  for c in df.columns
3060
- if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3230
+ if c not in sort_columns
3231
+ and c not in sorted_other_keys
3232
+ and c not in sort_exclude_columns
3233
+ and df[c].nunique() > 1
3061
3234
  ]
3062
- # [
3063
- # sk
3064
- # for sk, key_type in search_keys.items()
3065
- # if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
3066
- # and sk in df.columns
3067
- # and df[sk].nunique() > 1 # don't use constant keys for hash
3068
- # ]
3069
3235
  )
3070
3236
 
3237
+ all_other_columns = sorted_other_keys + other_columns
3238
+
3071
3239
  search_keys_hash = "search_keys_hash"
3072
- if len(other_columns) > 0:
3240
+ if len(all_other_columns) > 0:
3073
3241
  sort_columns.append(search_keys_hash)
3074
- df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
3242
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3075
3243
 
3076
3244
  df = df.sort_values(by=sort_columns)
3077
3245
 
3078
3246
  if search_keys_hash in df.columns:
3079
3247
  df.drop(columns=search_keys_hash, inplace=True)
3080
3248
 
3081
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3082
- df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3083
-
3084
3249
  df = df.reset_index(drop=True).reset_index()
3085
3250
  # system_record_id saves correct order for fit
3086
3251
  df = df.rename(columns={DEFAULT_INDEX: id_name})
@@ -3090,11 +3255,11 @@ class FeaturesEnricher(TransformerMixin):
3090
3255
  df.index.name = original_index_name
3091
3256
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3092
3257
 
3093
- meaning_types[id_name] = (
3094
- FileColumnMeaningType.SYSTEM_RECORD_ID
3095
- if id_name == SYSTEM_RECORD_ID
3096
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3097
- )
3258
+ # meaning_types[id_name] = (
3259
+ # FileColumnMeaningType.SYSTEM_RECORD_ID
3260
+ # if id_name == SYSTEM_RECORD_ID
3261
+ # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3262
+ # )
3098
3263
  return df
3099
3264
 
3100
3265
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3472,10 +3637,13 @@ class FeaturesEnricher(TransformerMixin):
3472
3637
  for _, key_type in search_keys.items():
3473
3638
  if not isinstance(key_type, SearchKey):
3474
3639
  raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
3640
+
3475
3641
  valid_search_keys = {}
3476
3642
  unsupported_search_keys = {
3477
3643
  SearchKey.IP_RANGE_FROM,
3478
3644
  SearchKey.IP_RANGE_TO,
3645
+ SearchKey.IPV6_RANGE_FROM,
3646
+ SearchKey.IPV6_RANGE_TO,
3479
3647
  SearchKey.MSISDN_RANGE_FROM,
3480
3648
  SearchKey.MSISDN_RANGE_TO,
3481
3649
  # SearchKey.EMAIL_ONE_DOMAIN,
@@ -3565,6 +3733,7 @@ class FeaturesEnricher(TransformerMixin):
3565
3733
  print(msg)
3566
3734
  self.logger.warning(msg)
3567
3735
  self.warning_counter.increment()
3736
+ # TODO maybe raise ValidationError
3568
3737
 
3569
3738
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3570
3739