upgini 1.1.312__py3-none-any.whl → 1.1.312a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +7 -26
- upgini/autofe/binary.py +4 -95
- upgini/autofe/date.py +3 -16
- upgini/autofe/feature.py +11 -25
- upgini/autofe/unary.py +0 -7
- upgini/dataset.py +30 -385
- upgini/features_enricher.py +276 -120
- upgini/metadata.py +16 -1
- upgini/normalizer/normalize_utils.py +203 -0
- upgini/utils/country_utils.py +16 -0
- upgini/utils/datetime_utils.py +34 -15
- upgini/utils/email_utils.py +19 -5
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +345 -0
- upgini/utils/postal_code_utils.py +34 -0
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/METADATA +1 -3
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/RECORD +20 -20
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/WHEEL +1 -1
- upgini/normalizer/phone_normalizer.py +0 -340
- {upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -61,11 +61,15 @@ from upgini.metadata import (
|
|
|
61
61
|
SearchKey,
|
|
62
62
|
)
|
|
63
63
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
64
|
+
from upgini.normalizer.normalize_utils import Normalizer
|
|
64
65
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
65
66
|
from upgini.search_task import SearchTask
|
|
66
67
|
from upgini.spinner import Spinner
|
|
67
68
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
68
|
-
from upgini.utils.country_utils import
|
|
69
|
+
from upgini.utils.country_utils import (
|
|
70
|
+
CountrySearchKeyConverter,
|
|
71
|
+
CountrySearchKeyDetector,
|
|
72
|
+
)
|
|
69
73
|
from upgini.utils.custom_loss_utils import (
|
|
70
74
|
get_additional_params_custom_loss,
|
|
71
75
|
get_runtime_params_custom_loss,
|
|
@@ -90,8 +94,12 @@ from upgini.utils.display_utils import (
|
|
|
90
94
|
from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
91
95
|
from upgini.utils.features_validator import FeaturesValidator
|
|
92
96
|
from upgini.utils.format import Format
|
|
93
|
-
from upgini.utils.
|
|
94
|
-
from upgini.utils.
|
|
97
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
98
|
+
from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
|
|
99
|
+
from upgini.utils.postal_code_utils import (
|
|
100
|
+
PostalCodeSearchKeyConverter,
|
|
101
|
+
PostalCodeSearchKeyDetector,
|
|
102
|
+
)
|
|
95
103
|
|
|
96
104
|
try:
|
|
97
105
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -237,6 +245,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
245
|
|
|
238
246
|
self.passed_features: List[str] = []
|
|
239
247
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
248
|
+
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
240
249
|
self.country_added = False
|
|
241
250
|
self.fit_generated_features: List[str] = []
|
|
242
251
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -247,7 +256,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
247
256
|
self.eval_set: Optional[List[Tuple]] = None
|
|
248
257
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
249
258
|
self.imbalanced = False
|
|
250
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
259
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
|
|
251
260
|
|
|
252
261
|
validate_version(self.logger)
|
|
253
262
|
self.search_keys = search_keys or {}
|
|
@@ -706,7 +715,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
706
715
|
|
|
707
716
|
start_time = time.time()
|
|
708
717
|
try:
|
|
709
|
-
result = self.__inner_transform(
|
|
718
|
+
result, _ = self.__inner_transform(
|
|
710
719
|
trace_id,
|
|
711
720
|
X,
|
|
712
721
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -906,8 +915,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
906
915
|
search_keys,
|
|
907
916
|
groups,
|
|
908
917
|
_cv,
|
|
918
|
+
columns_renaming,
|
|
909
919
|
) = prepared_data
|
|
910
920
|
|
|
921
|
+
# rename cat_features
|
|
922
|
+
if cat_features:
|
|
923
|
+
for new_c, old_c in columns_renaming.items():
|
|
924
|
+
if old_c in cat_features:
|
|
925
|
+
cat_features.remove(old_c)
|
|
926
|
+
cat_features.append(new_c)
|
|
927
|
+
|
|
911
928
|
gc.collect()
|
|
912
929
|
|
|
913
930
|
print(self.bundle.get("metrics_start"))
|
|
@@ -920,7 +937,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
920
937
|
|
|
921
938
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
922
939
|
|
|
923
|
-
has_date =
|
|
940
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
924
941
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
925
942
|
|
|
926
943
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1113,7 +1130,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1113
1130
|
)
|
|
1114
1131
|
|
|
1115
1132
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1116
|
-
date_column =
|
|
1133
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1117
1134
|
if (
|
|
1118
1135
|
uplift_col in metrics_df.columns
|
|
1119
1136
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1201,9 +1218,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1201
1218
|
|
|
1202
1219
|
extended_X = x.copy()
|
|
1203
1220
|
generated_features = []
|
|
1204
|
-
date_column =
|
|
1221
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1205
1222
|
if date_column is not None:
|
|
1206
|
-
converter = DateTimeSearchKeyConverter(
|
|
1223
|
+
converter = DateTimeSearchKeyConverter(
|
|
1224
|
+
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1225
|
+
)
|
|
1207
1226
|
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1208
1227
|
generated_features.extend(converter.generated_features)
|
|
1209
1228
|
email_column = self._get_email_column(search_keys)
|
|
@@ -1259,7 +1278,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1259
1278
|
groups = None
|
|
1260
1279
|
|
|
1261
1280
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1262
|
-
date_column =
|
|
1281
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1263
1282
|
date_series = X[date_column] if date_column is not None else None
|
|
1264
1283
|
_cv, groups = CVConfig(
|
|
1265
1284
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1282,7 +1301,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1282
1301
|
|
|
1283
1302
|
def _get_client_cat_features(
|
|
1284
1303
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1285
|
-
) -> Optional[List[str]]:
|
|
1304
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
|
1286
1305
|
cat_features = None
|
|
1287
1306
|
search_keys_for_metrics = []
|
|
1288
1307
|
if (
|
|
@@ -1342,11 +1361,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1342
1361
|
progress_bar,
|
|
1343
1362
|
progress_callback,
|
|
1344
1363
|
)
|
|
1345
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(
|
|
1364
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
|
|
1365
|
+
sampled_data
|
|
1366
|
+
)
|
|
1346
1367
|
|
|
1347
1368
|
excluding_search_keys = list(search_keys.keys())
|
|
1348
1369
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1349
|
-
|
|
1370
|
+
for sk in excluding_search_keys:
|
|
1371
|
+
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1372
|
+
excluding_search_keys.remove(sk)
|
|
1350
1373
|
|
|
1351
1374
|
client_features = [
|
|
1352
1375
|
c
|
|
@@ -1392,6 +1415,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1392
1415
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1393
1416
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1394
1417
|
|
|
1418
|
+
# TODO maybe there is no more need for these convertions
|
|
1395
1419
|
# Remove datetime features
|
|
1396
1420
|
datetime_features = [
|
|
1397
1421
|
f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
|
|
@@ -1479,6 +1503,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1479
1503
|
search_keys,
|
|
1480
1504
|
groups,
|
|
1481
1505
|
cv,
|
|
1506
|
+
columns_renaming,
|
|
1482
1507
|
)
|
|
1483
1508
|
|
|
1484
1509
|
@dataclass
|
|
@@ -1488,6 +1513,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1488
1513
|
enriched_X: pd.DataFrame
|
|
1489
1514
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1490
1515
|
search_keys: Dict[str, SearchKey]
|
|
1516
|
+
columns_renaming: Dict[str, str]
|
|
1491
1517
|
|
|
1492
1518
|
def _sample_data_for_metrics(
|
|
1493
1519
|
self,
|
|
@@ -1527,11 +1553,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1527
1553
|
)
|
|
1528
1554
|
|
|
1529
1555
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1530
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys =
|
|
1556
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
|
1557
|
+
self.__cached_sampled_datasets
|
|
1558
|
+
)
|
|
1531
1559
|
if exclude_features_sources:
|
|
1532
1560
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1533
1561
|
|
|
1534
|
-
return self.__mk_sampled_data_tuple(
|
|
1562
|
+
return self.__mk_sampled_data_tuple(
|
|
1563
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1564
|
+
)
|
|
1535
1565
|
|
|
1536
1566
|
def __sample_only_input(
|
|
1537
1567
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1549,6 +1579,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1549
1579
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1550
1580
|
df = pd.concat([df, eval_xy])
|
|
1551
1581
|
|
|
1582
|
+
search_keys = self.search_keys.copy()
|
|
1583
|
+
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1584
|
+
|
|
1585
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1586
|
+
generated_features = []
|
|
1587
|
+
if date_column is not None:
|
|
1588
|
+
converter = DateTimeSearchKeyConverter(
|
|
1589
|
+
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1590
|
+
)
|
|
1591
|
+
df = converter.convert(df, keep_time=True)
|
|
1592
|
+
generated_features = converter.generated_features
|
|
1593
|
+
|
|
1594
|
+
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1595
|
+
df = normalizer.normalize(df)
|
|
1596
|
+
columns_renaming = normalizer.columns_renaming
|
|
1597
|
+
|
|
1552
1598
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1553
1599
|
|
|
1554
1600
|
num_samples = _num_samples(df)
|
|
@@ -1561,24 +1607,41 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1561
1607
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1562
1608
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1563
1609
|
|
|
1564
|
-
|
|
1565
|
-
|
|
1610
|
+
email_column = self._get_email_column(search_keys)
|
|
1611
|
+
hem_column = self._get_hem_column(search_keys)
|
|
1612
|
+
if email_column:
|
|
1613
|
+
converter = EmailSearchKeyConverter(
|
|
1614
|
+
email_column, hem_column, search_keys, columns_renaming, [], self.bundle, self.logger
|
|
1615
|
+
)
|
|
1616
|
+
df = converter.convert(df)
|
|
1617
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1618
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1619
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1566
1620
|
|
|
1567
|
-
train_df =
|
|
1621
|
+
train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
|
|
1568
1622
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1569
1623
|
y_sampled = train_df[TARGET].copy()
|
|
1570
1624
|
enriched_X = X_sampled
|
|
1571
1625
|
|
|
1572
1626
|
if eval_set is not None:
|
|
1573
1627
|
for idx in range(len(eval_set)):
|
|
1574
|
-
eval_xy_sampled =
|
|
1628
|
+
eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1575
1629
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1576
1630
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1577
1631
|
enriched_eval_X = eval_X_sampled
|
|
1578
1632
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1579
|
-
self.__cached_sampled_datasets = (
|
|
1633
|
+
self.__cached_sampled_datasets = (
|
|
1634
|
+
X_sampled,
|
|
1635
|
+
y_sampled,
|
|
1636
|
+
enriched_X,
|
|
1637
|
+
eval_set_sampled_dict,
|
|
1638
|
+
search_keys,
|
|
1639
|
+
columns_renaming,
|
|
1640
|
+
)
|
|
1580
1641
|
|
|
1581
|
-
return self.__mk_sampled_data_tuple(
|
|
1642
|
+
return self.__mk_sampled_data_tuple(
|
|
1643
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1644
|
+
)
|
|
1582
1645
|
|
|
1583
1646
|
def __sample_balanced(
|
|
1584
1647
|
self,
|
|
@@ -1590,7 +1653,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1590
1653
|
search_keys = self.fit_search_keys
|
|
1591
1654
|
|
|
1592
1655
|
rows_to_drop = None
|
|
1593
|
-
has_date =
|
|
1656
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1594
1657
|
task_type = self.model_task_type or define_task(
|
|
1595
1658
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1596
1659
|
)
|
|
@@ -1644,9 +1707,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1644
1707
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1645
1708
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1646
1709
|
|
|
1647
|
-
self.__cached_sampled_datasets = (
|
|
1710
|
+
self.__cached_sampled_datasets = (
|
|
1711
|
+
X_sampled,
|
|
1712
|
+
y_sampled,
|
|
1713
|
+
enriched_X,
|
|
1714
|
+
eval_set_sampled_dict,
|
|
1715
|
+
search_keys,
|
|
1716
|
+
self.fit_columns_renaming,
|
|
1717
|
+
)
|
|
1648
1718
|
|
|
1649
|
-
return self.__mk_sampled_data_tuple(
|
|
1719
|
+
return self.__mk_sampled_data_tuple(
|
|
1720
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1721
|
+
)
|
|
1650
1722
|
|
|
1651
1723
|
def __sample_imbalanced(
|
|
1652
1724
|
self,
|
|
@@ -1686,7 +1758,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1686
1758
|
tmp_target_name = "__target"
|
|
1687
1759
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1688
1760
|
|
|
1689
|
-
enriched_df = self.__inner_transform(
|
|
1761
|
+
enriched_df, columns_renaming = self.__inner_transform(
|
|
1690
1762
|
trace_id,
|
|
1691
1763
|
df,
|
|
1692
1764
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1734,7 +1806,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1734
1806
|
tmp_target_name = "__target"
|
|
1735
1807
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1736
1808
|
|
|
1737
|
-
enriched_Xy = self.__inner_transform(
|
|
1809
|
+
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1738
1810
|
trace_id,
|
|
1739
1811
|
df,
|
|
1740
1812
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1759,9 +1831,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1759
1831
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1760
1832
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1761
1833
|
|
|
1762
|
-
self.__cached_sampled_datasets = (
|
|
1834
|
+
self.__cached_sampled_datasets = (
|
|
1835
|
+
X_sampled,
|
|
1836
|
+
y_sampled,
|
|
1837
|
+
enriched_X,
|
|
1838
|
+
eval_set_sampled_dict,
|
|
1839
|
+
self.search_keys,
|
|
1840
|
+
columns_renaming,
|
|
1841
|
+
)
|
|
1763
1842
|
|
|
1764
|
-
return self.__mk_sampled_data_tuple(
|
|
1843
|
+
return self.__mk_sampled_data_tuple(
|
|
1844
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1845
|
+
)
|
|
1765
1846
|
|
|
1766
1847
|
def __mk_sampled_data_tuple(
|
|
1767
1848
|
self,
|
|
@@ -1770,6 +1851,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1770
1851
|
enriched_X: pd.DataFrame,
|
|
1771
1852
|
eval_set_sampled_dict: Dict,
|
|
1772
1853
|
search_keys: Dict,
|
|
1854
|
+
columns_renaming: Dict[str, str],
|
|
1773
1855
|
):
|
|
1774
1856
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1775
1857
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1778,6 +1860,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1778
1860
|
enriched_X=enriched_X,
|
|
1779
1861
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1780
1862
|
search_keys=search_keys,
|
|
1863
|
+
columns_renaming=columns_renaming,
|
|
1781
1864
|
)
|
|
1782
1865
|
|
|
1783
1866
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1866,7 +1949,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1866
1949
|
progress_bar: Optional[ProgressBar] = None,
|
|
1867
1950
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1868
1951
|
add_fit_system_record_id: bool = False,
|
|
1869
|
-
) -> pd.DataFrame:
|
|
1952
|
+
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
1870
1953
|
if self._search_task is None:
|
|
1871
1954
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1872
1955
|
|
|
@@ -1879,13 +1962,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1962
|
|
|
1880
1963
|
if len(self.feature_names_) == 0:
|
|
1881
1964
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1882
|
-
return X
|
|
1965
|
+
return X, {c: c for c in X.columns}
|
|
1883
1966
|
|
|
1884
1967
|
if self._has_paid_features(exclude_features_sources):
|
|
1885
1968
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1886
1969
|
self.logger.warning(msg)
|
|
1887
1970
|
self.__display_support_link(msg)
|
|
1888
|
-
return None
|
|
1971
|
+
return None, {c: c for c in X.columns}
|
|
1889
1972
|
|
|
1890
1973
|
if not metrics_calculation:
|
|
1891
1974
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1896,7 +1979,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1979
|
self.logger.warning(msg)
|
|
1897
1980
|
print(msg)
|
|
1898
1981
|
show_request_quote_button()
|
|
1899
|
-
return None
|
|
1982
|
+
return None, {c: c for c in X.columns}
|
|
1900
1983
|
else:
|
|
1901
1984
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1902
1985
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -1934,9 +2017,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1934
2017
|
df = self.__add_country_code(df, search_keys)
|
|
1935
2018
|
|
|
1936
2019
|
generated_features = []
|
|
1937
|
-
date_column =
|
|
2020
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1938
2021
|
if date_column is not None:
|
|
1939
|
-
converter = DateTimeSearchKeyConverter(
|
|
2022
|
+
converter = DateTimeSearchKeyConverter(
|
|
2023
|
+
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2024
|
+
)
|
|
1940
2025
|
df = converter.convert(df)
|
|
1941
2026
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
1942
2027
|
generated_features.extend(converter.generated_features)
|
|
@@ -1945,61 +2030,87 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1945
2030
|
if self.add_date_if_missing:
|
|
1946
2031
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1947
2032
|
|
|
2033
|
+
normalizer = Normalizer(
|
|
2034
|
+
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2035
|
+
)
|
|
2036
|
+
df = normalizer.normalize(df)
|
|
2037
|
+
columns_renaming = normalizer.columns_renaming
|
|
2038
|
+
|
|
1948
2039
|
# Don't pass all features in backend on transform
|
|
1949
|
-
original_features_for_transform = []
|
|
1950
2040
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1951
|
-
|
|
1952
|
-
if len(
|
|
1953
|
-
|
|
1954
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
-
original_features_for_transform = [
|
|
1958
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
-
]
|
|
1960
|
-
|
|
1961
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2041
|
+
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2042
|
+
if len(features_for_transform) > 0:
|
|
2043
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1962
2044
|
|
|
1963
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) +
|
|
2045
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
1964
2046
|
|
|
1965
2047
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1966
2048
|
df[columns_for_system_record_id], index=False
|
|
1967
2049
|
).astype("Float64")
|
|
1968
2050
|
|
|
1969
2051
|
# Explode multiple search keys
|
|
1970
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2052
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
|
1971
2053
|
|
|
1972
2054
|
email_column = self._get_email_column(search_keys)
|
|
1973
2055
|
hem_column = self._get_hem_column(search_keys)
|
|
1974
|
-
email_converted_to_hem = False
|
|
2056
|
+
# email_converted_to_hem = False
|
|
1975
2057
|
if email_column:
|
|
1976
2058
|
converter = EmailSearchKeyConverter(
|
|
1977
|
-
email_column,
|
|
2059
|
+
email_column,
|
|
2060
|
+
hem_column,
|
|
2061
|
+
search_keys,
|
|
2062
|
+
columns_renaming,
|
|
2063
|
+
list(unnest_search_keys.keys()),
|
|
2064
|
+
self.logger,
|
|
1978
2065
|
)
|
|
1979
2066
|
df = converter.convert(df)
|
|
1980
2067
|
generated_features.extend(converter.generated_features)
|
|
1981
|
-
|
|
2068
|
+
|
|
2069
|
+
ip_column = self._get_ip_column(search_keys)
|
|
2070
|
+
if ip_column:
|
|
2071
|
+
converter = IpSearchKeyConverter(
|
|
2072
|
+
ip_column,
|
|
2073
|
+
search_keys,
|
|
2074
|
+
columns_renaming,
|
|
2075
|
+
list(unnest_search_keys.keys()),
|
|
2076
|
+
self.bundle,
|
|
2077
|
+
self.logger,
|
|
2078
|
+
)
|
|
2079
|
+
df = converter.convert(df)
|
|
2080
|
+
|
|
2081
|
+
phone_column = self._get_phone_column(search_keys)
|
|
2082
|
+
country_column = self._get_country_column(search_keys)
|
|
2083
|
+
if phone_column:
|
|
2084
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2085
|
+
df = converter.convert(df)
|
|
2086
|
+
|
|
2087
|
+
if country_column:
|
|
2088
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2089
|
+
df = converter.convert(df)
|
|
2090
|
+
|
|
2091
|
+
postal_code = self._get_postal_column(search_keys)
|
|
2092
|
+
if postal_code:
|
|
2093
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2094
|
+
df = converter.convert(df)
|
|
2095
|
+
|
|
1982
2096
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1983
2097
|
|
|
1984
2098
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1985
|
-
|
|
1986
|
-
for col in original_features_for_transform:
|
|
2099
|
+
for col in features_for_transform:
|
|
1987
2100
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1988
|
-
features_not_to_pass = [
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
features_not_to_pass.append(email_column)
|
|
1992
|
-
|
|
1993
|
-
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2101
|
+
features_not_to_pass = [
|
|
2102
|
+
c for c in df.columns if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
|
|
2103
|
+
]
|
|
1995
2104
|
|
|
1996
2105
|
if add_fit_system_record_id:
|
|
1997
|
-
df = self.__add_fit_system_record_id(df,
|
|
2106
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2107
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2108
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1998
2109
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1999
2110
|
features_not_to_pass.append(SORT_ID)
|
|
2000
2111
|
|
|
2001
|
-
|
|
2002
|
-
|
|
2112
|
+
# search keys might be changed after explode
|
|
2113
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2003
2114
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2004
2115
|
"Float64"
|
|
2005
2116
|
)
|
|
@@ -2035,8 +2146,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2035
2146
|
rest_client=self.rest_client,
|
|
2036
2147
|
logger=self.logger,
|
|
2037
2148
|
)
|
|
2038
|
-
|
|
2039
|
-
dataset.ignore_columns = [email_column]
|
|
2149
|
+
dataset.columns_renaming = columns_renaming
|
|
2040
2150
|
|
|
2041
2151
|
if max_features is not None or importance_threshold is not None:
|
|
2042
2152
|
exclude_features_sources = list(
|
|
@@ -2138,7 +2248,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2138
2248
|
if add_fit_system_record_id:
|
|
2139
2249
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2140
2250
|
|
|
2141
|
-
return result
|
|
2251
|
+
return result, columns_renaming
|
|
2142
2252
|
|
|
2143
2253
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2144
2254
|
features_info = self._internal_features_info
|
|
@@ -2239,6 +2349,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2239
2349
|
self.df_with_original_index = None
|
|
2240
2350
|
self.__cached_sampled_datasets = None
|
|
2241
2351
|
self.metrics = None
|
|
2352
|
+
self.fit_columns_renaming = None
|
|
2353
|
+
self.fit_dropped_features = set()
|
|
2354
|
+
self.fit_generated_features = []
|
|
2242
2355
|
|
|
2243
2356
|
validated_X = self._validate_X(X)
|
|
2244
2357
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2285,9 +2398,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2285
2398
|
self.fit_search_keys = self.search_keys.copy()
|
|
2286
2399
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2287
2400
|
|
|
2288
|
-
maybe_date_column =
|
|
2401
|
+
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2289
2402
|
has_date = maybe_date_column is not None
|
|
2290
2403
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2404
|
+
|
|
2291
2405
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2292
2406
|
|
|
2293
2407
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2317,7 +2431,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2317
2431
|
self.fit_generated_features = []
|
|
2318
2432
|
|
|
2319
2433
|
if has_date:
|
|
2320
|
-
converter = DateTimeSearchKeyConverter(
|
|
2434
|
+
converter = DateTimeSearchKeyConverter(
|
|
2435
|
+
maybe_date_column,
|
|
2436
|
+
self.date_format,
|
|
2437
|
+
self.logger,
|
|
2438
|
+
bundle=self.bundle,
|
|
2439
|
+
warnings_counter=self.warning_counter,
|
|
2440
|
+
)
|
|
2321
2441
|
df = converter.convert(df, keep_time=True)
|
|
2322
2442
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2323
2443
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2334,7 +2454,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2334
2454
|
|
|
2335
2455
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2336
2456
|
|
|
2337
|
-
|
|
2457
|
+
normalizer = Normalizer(
|
|
2458
|
+
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2459
|
+
)
|
|
2460
|
+
df = normalizer.normalize(df)
|
|
2461
|
+
columns_renaming = normalizer.columns_renaming
|
|
2462
|
+
self.fit_columns_renaming = columns_renaming
|
|
2338
2463
|
|
|
2339
2464
|
df = remove_fintech_duplicates(
|
|
2340
2465
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2342,38 +2467,59 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2342
2467
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2343
2468
|
|
|
2344
2469
|
# Explode multiple search keys
|
|
2345
|
-
|
|
2346
|
-
meaning_types = {
|
|
2347
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
-
}
|
|
2350
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2470
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2354
2471
|
|
|
2355
2472
|
# TODO check that this is correct for enrichment
|
|
2356
2473
|
self.df_with_original_index = df.copy()
|
|
2474
|
+
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2357
2475
|
|
|
2358
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2476
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
|
|
2359
2477
|
|
|
2360
2478
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2361
2479
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2362
2480
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
-
email_converted_to_hem = False
|
|
2364
2481
|
if email_column:
|
|
2365
2482
|
converter = EmailSearchKeyConverter(
|
|
2366
|
-
email_column,
|
|
2483
|
+
email_column,
|
|
2484
|
+
hem_column,
|
|
2485
|
+
self.fit_search_keys,
|
|
2486
|
+
columns_renaming,
|
|
2487
|
+
list(unnest_search_keys.keys()),
|
|
2488
|
+
self.logger,
|
|
2367
2489
|
)
|
|
2368
2490
|
df = converter.convert(df)
|
|
2369
2491
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2370
|
-
|
|
2492
|
+
|
|
2493
|
+
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2494
|
+
if ip_column:
|
|
2495
|
+
converter = IpSearchKeyConverter(
|
|
2496
|
+
ip_column,
|
|
2497
|
+
self.fit_search_keys,
|
|
2498
|
+
columns_renaming,
|
|
2499
|
+
list(unnest_search_keys.keys()),
|
|
2500
|
+
self.bundle,
|
|
2501
|
+
self.logger,
|
|
2502
|
+
)
|
|
2503
|
+
df = converter.convert(df)
|
|
2504
|
+
|
|
2505
|
+
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2506
|
+
country_column = self._get_country_column(self.fit_search_keys)
|
|
2507
|
+
if phone_column:
|
|
2508
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2509
|
+
df = converter.convert(df)
|
|
2510
|
+
|
|
2511
|
+
if country_column:
|
|
2512
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2513
|
+
df = converter.convert(df)
|
|
2514
|
+
|
|
2515
|
+
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2516
|
+
if postal_code:
|
|
2517
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2518
|
+
df = converter.convert(df)
|
|
2371
2519
|
|
|
2372
2520
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2373
2521
|
self.fit_search_keys.keys()
|
|
2374
2522
|
)
|
|
2375
|
-
if email_converted_to_hem:
|
|
2376
|
-
non_feature_columns.append(email_column)
|
|
2377
2523
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2378
2524
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2379
2525
|
|
|
@@ -2385,9 +2531,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2385
2531
|
self.fit_dropped_features.update(features_to_drop)
|
|
2386
2532
|
df = df.drop(columns=features_to_drop)
|
|
2387
2533
|
|
|
2388
|
-
if email_converted_to_hem:
|
|
2389
|
-
self.fit_dropped_features.add(email_column)
|
|
2390
|
-
|
|
2391
2534
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2392
2535
|
|
|
2393
2536
|
meaning_types = {
|
|
@@ -2401,7 +2544,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2401
2544
|
if eval_set is not None and len(eval_set) > 0:
|
|
2402
2545
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2403
2546
|
|
|
2404
|
-
df = self.__add_fit_system_record_id(df,
|
|
2547
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2548
|
+
|
|
2549
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2550
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2551
|
+
|
|
2552
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2405
2553
|
|
|
2406
2554
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2407
2555
|
|
|
@@ -2419,8 +2567,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2419
2567
|
rest_client=self.rest_client,
|
|
2420
2568
|
logger=self.logger,
|
|
2421
2569
|
)
|
|
2422
|
-
|
|
2423
|
-
dataset.ignore_columns = [email_column]
|
|
2570
|
+
dataset.columns_renaming = columns_renaming
|
|
2424
2571
|
|
|
2425
2572
|
self.passed_features = [
|
|
2426
2573
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2809,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2809
2956
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2810
2957
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2811
2958
|
else:
|
|
2812
|
-
date_column =
|
|
2959
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2813
2960
|
sort_columns = [date_column] if date_column is not None else []
|
|
2814
2961
|
|
|
2815
2962
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -2905,10 +3052,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2905
3052
|
|
|
2906
3053
|
do_without_pandas_limits(print_datasets_sample)
|
|
2907
3054
|
|
|
2908
|
-
maybe_date_col =
|
|
3055
|
+
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2909
3056
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2910
3057
|
# TODO cast date column to single dtype
|
|
2911
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3058
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
|
|
2912
3059
|
converted_X = date_converter.convert(X)
|
|
2913
3060
|
min_date = converted_X[maybe_date_col].min()
|
|
2914
3061
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -2935,12 +3082,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2935
3082
|
|
|
2936
3083
|
return df
|
|
2937
3084
|
|
|
2938
|
-
@staticmethod
|
|
2939
|
-
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
-
for col, t in search_keys.items():
|
|
2941
|
-
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
-
return col
|
|
2943
|
-
|
|
2944
3085
|
@staticmethod
|
|
2945
3086
|
def _add_current_date_as_key(
|
|
2946
3087
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -2956,7 +3097,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2956
3097
|
logger.warning(msg)
|
|
2957
3098
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2958
3099
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2959
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
3100
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
|
|
2960
3101
|
df = converter.convert(df)
|
|
2961
3102
|
return df
|
|
2962
3103
|
|
|
@@ -2984,17 +3125,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2984
3125
|
if len(cols) == 1:
|
|
2985
3126
|
return cols[0]
|
|
2986
3127
|
|
|
3128
|
+
@staticmethod
|
|
3129
|
+
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3130
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3131
|
+
if len(cols) > 1:
|
|
3132
|
+
raise Exception("More than one ip column found after unnest")
|
|
3133
|
+
if len(cols) == 1:
|
|
3134
|
+
return cols[0]
|
|
3135
|
+
|
|
2987
3136
|
@staticmethod
|
|
2988
3137
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2989
3138
|
for col, t in search_keys.items():
|
|
2990
3139
|
if t == SearchKey.PHONE:
|
|
2991
3140
|
return col
|
|
2992
3141
|
|
|
3142
|
+
@staticmethod
|
|
3143
|
+
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3144
|
+
for col, t in search_keys.items():
|
|
3145
|
+
if t == SearchKey.COUNTRY:
|
|
3146
|
+
return col
|
|
3147
|
+
|
|
3148
|
+
@staticmethod
|
|
3149
|
+
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3150
|
+
for col, t in search_keys.items():
|
|
3151
|
+
if t == SearchKey.POSTAL_CODE:
|
|
3152
|
+
return col
|
|
3153
|
+
|
|
2993
3154
|
def _explode_multiple_search_keys(
|
|
2994
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3155
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
2995
3156
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2996
3157
|
# find groups of multiple search keys
|
|
2997
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3158
|
+
search_key_names_by_type: Dict[SearchKey, List[str]] = {}
|
|
2998
3159
|
for key_name, key_type in search_keys.items():
|
|
2999
3160
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3000
3161
|
search_key_names_by_type = {
|
|
@@ -3018,6 +3179,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3018
3179
|
del search_keys[old_key]
|
|
3019
3180
|
search_keys[new_search_key] = key_type
|
|
3020
3181
|
unnest_search_keys[new_search_key] = key_names
|
|
3182
|
+
columns_renaming[new_search_key] = new_search_key
|
|
3021
3183
|
|
|
3022
3184
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3023
3185
|
return df, unnest_search_keys
|
|
@@ -3025,7 +3187,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3025
3187
|
def __add_fit_system_record_id(
|
|
3026
3188
|
self,
|
|
3027
3189
|
df: pd.DataFrame,
|
|
3028
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3190
|
+
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3029
3191
|
search_keys: Dict[str, SearchKey],
|
|
3030
3192
|
id_name: str,
|
|
3031
3193
|
) -> pd.DataFrame:
|
|
@@ -3048,9 +3210,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3048
3210
|
]
|
|
3049
3211
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3050
3212
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3051
|
-
sort_exclude_columns.append(
|
|
3213
|
+
sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
|
|
3052
3214
|
else:
|
|
3053
|
-
date_column =
|
|
3215
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3054
3216
|
sort_columns = [date_column] if date_column is not None else []
|
|
3055
3217
|
|
|
3056
3218
|
other_columns = sorted(
|
|
@@ -3059,13 +3221,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3059
3221
|
for c in df.columns
|
|
3060
3222
|
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
3061
3223
|
]
|
|
3062
|
-
# [
|
|
3063
|
-
# sk
|
|
3064
|
-
# for sk, key_type in search_keys.items()
|
|
3065
|
-
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
-
# and sk in df.columns
|
|
3067
|
-
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
-
# ]
|
|
3069
3224
|
)
|
|
3070
3225
|
|
|
3071
3226
|
search_keys_hash = "search_keys_hash"
|
|
@@ -3078,9 +3233,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3078
3233
|
if search_keys_hash in df.columns:
|
|
3079
3234
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3080
3235
|
|
|
3081
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
-
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
-
|
|
3084
3236
|
df = df.reset_index(drop=True).reset_index()
|
|
3085
3237
|
# system_record_id saves correct order for fit
|
|
3086
3238
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3090,11 +3242,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3090
3242
|
df.index.name = original_index_name
|
|
3091
3243
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3092
3244
|
|
|
3093
|
-
meaning_types[id_name] = (
|
|
3094
|
-
|
|
3095
|
-
|
|
3096
|
-
|
|
3097
|
-
)
|
|
3245
|
+
# meaning_types[id_name] = (
|
|
3246
|
+
# FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3247
|
+
# if id_name == SYSTEM_RECORD_ID
|
|
3248
|
+
# else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3249
|
+
# )
|
|
3098
3250
|
return df
|
|
3099
3251
|
|
|
3100
3252
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3472,10 +3624,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3472
3624
|
for _, key_type in search_keys.items():
|
|
3473
3625
|
if not isinstance(key_type, SearchKey):
|
|
3474
3626
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3627
|
+
|
|
3475
3628
|
valid_search_keys = {}
|
|
3476
3629
|
unsupported_search_keys = {
|
|
3477
3630
|
SearchKey.IP_RANGE_FROM,
|
|
3478
3631
|
SearchKey.IP_RANGE_TO,
|
|
3632
|
+
SearchKey.IPV6_RANGE_FROM,
|
|
3633
|
+
SearchKey.IPV6_RANGE_TO,
|
|
3479
3634
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3480
3635
|
SearchKey.MSISDN_RANGE_TO,
|
|
3481
3636
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3565,6 +3720,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3565
3720
|
print(msg)
|
|
3566
3721
|
self.logger.warning(msg)
|
|
3567
3722
|
self.warning_counter.increment()
|
|
3723
|
+
# TODO maybe raise ValidationError
|
|
3568
3724
|
|
|
3569
3725
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3570
3726
|
|