upgini 1.1.309a1__py3-none-any.whl → 1.1.309a3511.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini may be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +26 -7
- upgini/autofe/binary.py +93 -2
- upgini/autofe/date.py +16 -3
- upgini/autofe/feature.py +24 -11
- upgini/autofe/unary.py +7 -0
- upgini/dataset.py +385 -30
- upgini/features_enricher.py +120 -276
- upgini/metadata.py +1 -16
- upgini/normalizer/phone_normalizer.py +340 -0
- upgini/utils/country_utils.py +0 -16
- upgini/utils/datetime_utils.py +15 -34
- upgini/utils/email_utils.py +5 -19
- upgini/utils/ip_utils.py +1 -100
- upgini/utils/phone_utils.py +0 -345
- upgini/utils/postal_code_utils.py +0 -34
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev2.dist-info}/METADATA +3 -1
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev2.dist-info}/RECORD +20 -20
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev2.dist-info}/WHEEL +1 -1
- upgini/normalizer/normalize_utils.py +0 -203
- {upgini-1.1.309a1.dist-info → upgini-1.1.309a3511.dev2.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -61,15 +61,11 @@ from upgini.metadata import (
|
|
|
61
61
|
SearchKey,
|
|
62
62
|
)
|
|
63
63
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
64
|
-
from upgini.normalizer.normalize_utils import Normalizer
|
|
65
64
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
66
65
|
from upgini.search_task import SearchTask
|
|
67
66
|
from upgini.spinner import Spinner
|
|
68
67
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
69
|
-
from upgini.utils.country_utils import
|
|
70
|
-
CountrySearchKeyConverter,
|
|
71
|
-
CountrySearchKeyDetector,
|
|
72
|
-
)
|
|
68
|
+
from upgini.utils.country_utils import CountrySearchKeyDetector
|
|
73
69
|
from upgini.utils.custom_loss_utils import (
|
|
74
70
|
get_additional_params_custom_loss,
|
|
75
71
|
get_runtime_params_custom_loss,
|
|
@@ -94,12 +90,8 @@ from upgini.utils.display_utils import (
|
|
|
94
90
|
from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
95
91
|
from upgini.utils.features_validator import FeaturesValidator
|
|
96
92
|
from upgini.utils.format import Format
|
|
97
|
-
from upgini.utils.
|
|
98
|
-
from upgini.utils.
|
|
99
|
-
from upgini.utils.postal_code_utils import (
|
|
100
|
-
PostalCodeSearchKeyConverter,
|
|
101
|
-
PostalCodeSearchKeyDetector,
|
|
102
|
-
)
|
|
93
|
+
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
|
94
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
|
103
95
|
|
|
104
96
|
try:
|
|
105
97
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -245,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
245
237
|
|
|
246
238
|
self.passed_features: List[str] = []
|
|
247
239
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
248
|
-
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
249
240
|
self.country_added = False
|
|
250
241
|
self.fit_generated_features: List[str] = []
|
|
251
242
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -256,7 +247,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
256
247
|
self.eval_set: Optional[List[Tuple]] = None
|
|
257
248
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
258
249
|
self.imbalanced = False
|
|
259
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict
|
|
250
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
260
251
|
|
|
261
252
|
validate_version(self.logger)
|
|
262
253
|
self.search_keys = search_keys or {}
|
|
@@ -715,7 +706,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
715
706
|
|
|
716
707
|
start_time = time.time()
|
|
717
708
|
try:
|
|
718
|
-
result
|
|
709
|
+
result = self.__inner_transform(
|
|
719
710
|
trace_id,
|
|
720
711
|
X,
|
|
721
712
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -915,16 +906,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
915
906
|
search_keys,
|
|
916
907
|
groups,
|
|
917
908
|
_cv,
|
|
918
|
-
columns_renaming,
|
|
919
909
|
) = prepared_data
|
|
920
910
|
|
|
921
|
-
# rename cat_features
|
|
922
|
-
if cat_features:
|
|
923
|
-
for new_c, old_c in columns_renaming.items():
|
|
924
|
-
if old_c in cat_features:
|
|
925
|
-
cat_features.remove(old_c)
|
|
926
|
-
cat_features.append(new_c)
|
|
927
|
-
|
|
928
911
|
gc.collect()
|
|
929
912
|
|
|
930
913
|
print(self.bundle.get("metrics_start"))
|
|
@@ -937,7 +920,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
937
920
|
|
|
938
921
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
939
922
|
|
|
940
|
-
has_date =
|
|
923
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
941
924
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
942
925
|
|
|
943
926
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1130,7 +1113,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1130
1113
|
)
|
|
1131
1114
|
|
|
1132
1115
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1133
|
-
date_column =
|
|
1116
|
+
date_column = self._get_date_column(search_keys)
|
|
1134
1117
|
if (
|
|
1135
1118
|
uplift_col in metrics_df.columns
|
|
1136
1119
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1218,11 +1201,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1218
1201
|
|
|
1219
1202
|
extended_X = x.copy()
|
|
1220
1203
|
generated_features = []
|
|
1221
|
-
date_column =
|
|
1204
|
+
date_column = self._get_date_column(search_keys)
|
|
1222
1205
|
if date_column is not None:
|
|
1223
|
-
converter = DateTimeSearchKeyConverter(
|
|
1224
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1225
|
-
)
|
|
1206
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1226
1207
|
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1227
1208
|
generated_features.extend(converter.generated_features)
|
|
1228
1209
|
email_column = self._get_email_column(search_keys)
|
|
@@ -1278,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1278
1259
|
groups = None
|
|
1279
1260
|
|
|
1280
1261
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1281
|
-
date_column =
|
|
1262
|
+
date_column = self._get_date_column(search_keys)
|
|
1282
1263
|
date_series = X[date_column] if date_column is not None else None
|
|
1283
1264
|
_cv, groups = CVConfig(
|
|
1284
1265
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1301,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1301
1282
|
|
|
1302
1283
|
def _get_client_cat_features(
|
|
1303
1284
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1304
|
-
) ->
|
|
1285
|
+
) -> Optional[List[str]]:
|
|
1305
1286
|
cat_features = None
|
|
1306
1287
|
search_keys_for_metrics = []
|
|
1307
1288
|
if (
|
|
@@ -1361,15 +1342,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1361
1342
|
progress_bar,
|
|
1362
1343
|
progress_callback,
|
|
1363
1344
|
)
|
|
1364
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
|
|
1365
|
-
sampled_data
|
|
1366
|
-
)
|
|
1345
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
|
|
1367
1346
|
|
|
1368
1347
|
excluding_search_keys = list(search_keys.keys())
|
|
1369
1348
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1370
|
-
for sk in excluding_search_keys
|
|
1371
|
-
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1372
|
-
excluding_search_keys.remove(sk)
|
|
1349
|
+
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
|
|
1373
1350
|
|
|
1374
1351
|
client_features = [
|
|
1375
1352
|
c
|
|
@@ -1415,7 +1392,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1415
1392
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1416
1393
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1417
1394
|
|
|
1418
|
-
# TODO maybe there is no more need for these convertions
|
|
1419
1395
|
# Remove datetime features
|
|
1420
1396
|
datetime_features = [
|
|
1421
1397
|
f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
|
|
@@ -1503,7 +1479,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1503
1479
|
search_keys,
|
|
1504
1480
|
groups,
|
|
1505
1481
|
cv,
|
|
1506
|
-
columns_renaming,
|
|
1507
1482
|
)
|
|
1508
1483
|
|
|
1509
1484
|
@dataclass
|
|
@@ -1513,7 +1488,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1513
1488
|
enriched_X: pd.DataFrame
|
|
1514
1489
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1515
1490
|
search_keys: Dict[str, SearchKey]
|
|
1516
|
-
columns_renaming: Dict[str, str]
|
|
1517
1491
|
|
|
1518
1492
|
def _sample_data_for_metrics(
|
|
1519
1493
|
self,
|
|
@@ -1553,15 +1527,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1553
1527
|
)
|
|
1554
1528
|
|
|
1555
1529
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1556
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
|
|
1557
|
-
self.__cached_sampled_datasets
|
|
1558
|
-
)
|
|
1530
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
|
|
1559
1531
|
if exclude_features_sources:
|
|
1560
1532
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1561
1533
|
|
|
1562
|
-
return self.__mk_sampled_data_tuple(
|
|
1563
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1564
|
-
)
|
|
1534
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1565
1535
|
|
|
1566
1536
|
def __sample_only_input(
|
|
1567
1537
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1579,22 +1549,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1579
1549
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1580
1550
|
df = pd.concat([df, eval_xy])
|
|
1581
1551
|
|
|
1582
|
-
search_keys = self.search_keys.copy()
|
|
1583
|
-
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1584
|
-
|
|
1585
|
-
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1586
|
-
generated_features = []
|
|
1587
|
-
if date_column is not None:
|
|
1588
|
-
converter = DateTimeSearchKeyConverter(
|
|
1589
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1590
|
-
)
|
|
1591
|
-
df = converter.convert(df, keep_time=True)
|
|
1592
|
-
generated_features = converter.generated_features
|
|
1593
|
-
|
|
1594
|
-
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1595
|
-
df = normalizer.normalize(df)
|
|
1596
|
-
columns_renaming = normalizer.columns_renaming
|
|
1597
|
-
|
|
1598
1552
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1599
1553
|
|
|
1600
1554
|
num_samples = _num_samples(df)
|
|
@@ -1607,41 +1561,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1607
1561
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1608
1562
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1609
1563
|
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
if email_column:
|
|
1613
|
-
converter = EmailSearchKeyConverter(
|
|
1614
|
-
email_column, hem_column, search_keys, columns_renaming, [], self.bundle, self.logger
|
|
1615
|
-
)
|
|
1616
|
-
df = converter.convert(df)
|
|
1617
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1618
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1619
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1564
|
+
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1565
|
+
df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1620
1566
|
|
|
1621
|
-
train_df =
|
|
1567
|
+
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1622
1568
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1623
1569
|
y_sampled = train_df[TARGET].copy()
|
|
1624
1570
|
enriched_X = X_sampled
|
|
1625
1571
|
|
|
1626
1572
|
if eval_set is not None:
|
|
1627
1573
|
for idx in range(len(eval_set)):
|
|
1628
|
-
eval_xy_sampled =
|
|
1574
|
+
eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1629
1575
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1630
1576
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1631
1577
|
enriched_eval_X = eval_X_sampled
|
|
1632
1578
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1633
|
-
self.__cached_sampled_datasets = (
|
|
1634
|
-
X_sampled,
|
|
1635
|
-
y_sampled,
|
|
1636
|
-
enriched_X,
|
|
1637
|
-
eval_set_sampled_dict,
|
|
1638
|
-
search_keys,
|
|
1639
|
-
columns_renaming,
|
|
1640
|
-
)
|
|
1579
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1641
1580
|
|
|
1642
|
-
return self.__mk_sampled_data_tuple(
|
|
1643
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1644
|
-
)
|
|
1581
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1645
1582
|
|
|
1646
1583
|
def __sample_balanced(
|
|
1647
1584
|
self,
|
|
@@ -1653,7 +1590,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1653
1590
|
search_keys = self.fit_search_keys
|
|
1654
1591
|
|
|
1655
1592
|
rows_to_drop = None
|
|
1656
|
-
has_date =
|
|
1593
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
1657
1594
|
task_type = self.model_task_type or define_task(
|
|
1658
1595
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1659
1596
|
)
|
|
@@ -1707,18 +1644,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1707
1644
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1708
1645
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1709
1646
|
|
|
1710
|
-
self.__cached_sampled_datasets = (
|
|
1711
|
-
X_sampled,
|
|
1712
|
-
y_sampled,
|
|
1713
|
-
enriched_X,
|
|
1714
|
-
eval_set_sampled_dict,
|
|
1715
|
-
search_keys,
|
|
1716
|
-
self.fit_columns_renaming,
|
|
1717
|
-
)
|
|
1647
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1718
1648
|
|
|
1719
|
-
return self.__mk_sampled_data_tuple(
|
|
1720
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1721
|
-
)
|
|
1649
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1722
1650
|
|
|
1723
1651
|
def __sample_imbalanced(
|
|
1724
1652
|
self,
|
|
@@ -1758,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1758
1686
|
tmp_target_name = "__target"
|
|
1759
1687
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1760
1688
|
|
|
1761
|
-
enriched_df
|
|
1689
|
+
enriched_df = self.__inner_transform(
|
|
1762
1690
|
trace_id,
|
|
1763
1691
|
df,
|
|
1764
1692
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1806,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1806
1734
|
tmp_target_name = "__target"
|
|
1807
1735
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1808
1736
|
|
|
1809
|
-
enriched_Xy
|
|
1737
|
+
enriched_Xy = self.__inner_transform(
|
|
1810
1738
|
trace_id,
|
|
1811
1739
|
df,
|
|
1812
1740
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1831,18 +1759,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1831
1759
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1832
1760
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1833
1761
|
|
|
1834
|
-
self.__cached_sampled_datasets = (
|
|
1835
|
-
X_sampled,
|
|
1836
|
-
y_sampled,
|
|
1837
|
-
enriched_X,
|
|
1838
|
-
eval_set_sampled_dict,
|
|
1839
|
-
self.search_keys,
|
|
1840
|
-
columns_renaming,
|
|
1841
|
-
)
|
|
1762
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
|
|
1842
1763
|
|
|
1843
|
-
return self.__mk_sampled_data_tuple(
|
|
1844
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1845
|
-
)
|
|
1764
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
|
|
1846
1765
|
|
|
1847
1766
|
def __mk_sampled_data_tuple(
|
|
1848
1767
|
self,
|
|
@@ -1851,7 +1770,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1851
1770
|
enriched_X: pd.DataFrame,
|
|
1852
1771
|
eval_set_sampled_dict: Dict,
|
|
1853
1772
|
search_keys: Dict,
|
|
1854
|
-
columns_renaming: Dict[str, str],
|
|
1855
1773
|
):
|
|
1856
1774
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1857
1775
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1860,7 +1778,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1860
1778
|
enriched_X=enriched_X,
|
|
1861
1779
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1862
1780
|
search_keys=search_keys,
|
|
1863
|
-
columns_renaming=columns_renaming,
|
|
1864
1781
|
)
|
|
1865
1782
|
|
|
1866
1783
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1949,7 +1866,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1949
1866
|
progress_bar: Optional[ProgressBar] = None,
|
|
1950
1867
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1951
1868
|
add_fit_system_record_id: bool = False,
|
|
1952
|
-
) ->
|
|
1869
|
+
) -> pd.DataFrame:
|
|
1953
1870
|
if self._search_task is None:
|
|
1954
1871
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1955
1872
|
|
|
@@ -1962,13 +1879,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1962
1879
|
|
|
1963
1880
|
if len(self.feature_names_) == 0:
|
|
1964
1881
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1965
|
-
return X
|
|
1882
|
+
return X
|
|
1966
1883
|
|
|
1967
1884
|
if self._has_paid_features(exclude_features_sources):
|
|
1968
1885
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1969
1886
|
self.logger.warning(msg)
|
|
1970
1887
|
self.__display_support_link(msg)
|
|
1971
|
-
return None
|
|
1888
|
+
return None
|
|
1972
1889
|
|
|
1973
1890
|
if not metrics_calculation:
|
|
1974
1891
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1979,7 +1896,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1979
1896
|
self.logger.warning(msg)
|
|
1980
1897
|
print(msg)
|
|
1981
1898
|
show_request_quote_button()
|
|
1982
|
-
return None
|
|
1899
|
+
return None
|
|
1983
1900
|
else:
|
|
1984
1901
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1985
1902
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -2017,11 +1934,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2017
1934
|
df = self.__add_country_code(df, search_keys)
|
|
2018
1935
|
|
|
2019
1936
|
generated_features = []
|
|
2020
|
-
date_column =
|
|
1937
|
+
date_column = self._get_date_column(search_keys)
|
|
2021
1938
|
if date_column is not None:
|
|
2022
|
-
converter = DateTimeSearchKeyConverter(
|
|
2023
|
-
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2024
|
-
)
|
|
1939
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2025
1940
|
df = converter.convert(df)
|
|
2026
1941
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2027
1942
|
generated_features.extend(converter.generated_features)
|
|
@@ -2030,87 +1945,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2030
1945
|
if self.add_date_if_missing:
|
|
2031
1946
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2032
1947
|
|
|
2033
|
-
normalizer = Normalizer(
|
|
2034
|
-
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2035
|
-
)
|
|
2036
|
-
df = normalizer.normalize(df)
|
|
2037
|
-
columns_renaming = normalizer.columns_renaming
|
|
2038
|
-
|
|
2039
1948
|
# Don't pass all features in backend on transform
|
|
1949
|
+
original_features_for_transform = []
|
|
2040
1950
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
2041
|
-
|
|
2042
|
-
if len(
|
|
2043
|
-
|
|
1951
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1952
|
+
if len(features_not_to_pass) > 0:
|
|
1953
|
+
# Pass only features that need for transform
|
|
1954
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
+
original_features_for_transform = [
|
|
1958
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
+
]
|
|
1960
|
+
|
|
1961
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2044
1962
|
|
|
2045
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) +
|
|
1963
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2046
1964
|
|
|
2047
1965
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
2048
1966
|
df[columns_for_system_record_id], index=False
|
|
2049
1967
|
).astype("Float64")
|
|
2050
1968
|
|
|
2051
1969
|
# Explode multiple search keys
|
|
2052
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys
|
|
1970
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2053
1971
|
|
|
2054
1972
|
email_column = self._get_email_column(search_keys)
|
|
2055
1973
|
hem_column = self._get_hem_column(search_keys)
|
|
2056
|
-
|
|
1974
|
+
email_converted_to_hem = False
|
|
2057
1975
|
if email_column:
|
|
2058
1976
|
converter = EmailSearchKeyConverter(
|
|
2059
|
-
email_column,
|
|
2060
|
-
hem_column,
|
|
2061
|
-
search_keys,
|
|
2062
|
-
columns_renaming,
|
|
2063
|
-
list(unnest_search_keys.keys()),
|
|
2064
|
-
self.logger,
|
|
1977
|
+
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2065
1978
|
)
|
|
2066
1979
|
df = converter.convert(df)
|
|
2067
1980
|
generated_features.extend(converter.generated_features)
|
|
2068
|
-
|
|
2069
|
-
ip_column = self._get_ip_column(search_keys)
|
|
2070
|
-
if ip_column:
|
|
2071
|
-
converter = IpSearchKeyConverter(
|
|
2072
|
-
ip_column,
|
|
2073
|
-
search_keys,
|
|
2074
|
-
columns_renaming,
|
|
2075
|
-
list(unnest_search_keys.keys()),
|
|
2076
|
-
self.bundle,
|
|
2077
|
-
self.logger,
|
|
2078
|
-
)
|
|
2079
|
-
df = converter.convert(df)
|
|
2080
|
-
|
|
2081
|
-
phone_column = self._get_phone_column(search_keys)
|
|
2082
|
-
country_column = self._get_country_column(search_keys)
|
|
2083
|
-
if phone_column:
|
|
2084
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2085
|
-
df = converter.convert(df)
|
|
2086
|
-
|
|
2087
|
-
if country_column:
|
|
2088
|
-
converter = CountrySearchKeyConverter(country_column)
|
|
2089
|
-
df = converter.convert(df)
|
|
2090
|
-
|
|
2091
|
-
postal_code = self._get_postal_column(search_keys)
|
|
2092
|
-
if postal_code:
|
|
2093
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2094
|
-
df = converter.convert(df)
|
|
2095
|
-
|
|
1981
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2096
1982
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2097
1983
|
|
|
2098
1984
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2099
|
-
for
|
|
1985
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1986
|
+
for col in original_features_for_transform:
|
|
2100
1987
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2101
|
-
features_not_to_pass = [
|
|
2102
|
-
|
|
2103
|
-
|
|
1988
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1989
|
+
|
|
1990
|
+
if email_converted_to_hem:
|
|
1991
|
+
features_not_to_pass.append(email_column)
|
|
1992
|
+
|
|
1993
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2104
1995
|
|
|
2105
1996
|
if add_fit_system_record_id:
|
|
2106
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2107
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2108
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1997
|
+
df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
|
|
2109
1998
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2110
1999
|
features_not_to_pass.append(SORT_ID)
|
|
2111
2000
|
|
|
2112
|
-
|
|
2113
|
-
|
|
2001
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
2002
|
+
|
|
2114
2003
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2115
2004
|
"Float64"
|
|
2116
2005
|
)
|
|
@@ -2146,7 +2035,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2146
2035
|
rest_client=self.rest_client,
|
|
2147
2036
|
logger=self.logger,
|
|
2148
2037
|
)
|
|
2149
|
-
|
|
2038
|
+
if email_converted_to_hem:
|
|
2039
|
+
dataset.ignore_columns = [email_column]
|
|
2150
2040
|
|
|
2151
2041
|
if max_features is not None or importance_threshold is not None:
|
|
2152
2042
|
exclude_features_sources = list(
|
|
@@ -2248,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2248
2138
|
if add_fit_system_record_id:
|
|
2249
2139
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2250
2140
|
|
|
2251
|
-
return result
|
|
2141
|
+
return result
|
|
2252
2142
|
|
|
2253
2143
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2254
2144
|
features_info = self._internal_features_info
|
|
@@ -2349,9 +2239,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2349
2239
|
self.df_with_original_index = None
|
|
2350
2240
|
self.__cached_sampled_datasets = None
|
|
2351
2241
|
self.metrics = None
|
|
2352
|
-
self.fit_columns_renaming = None
|
|
2353
|
-
self.fit_dropped_features = set()
|
|
2354
|
-
self.fit_generated_features = []
|
|
2355
2242
|
|
|
2356
2243
|
validated_X = self._validate_X(X)
|
|
2357
2244
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2398,10 +2285,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2398
2285
|
self.fit_search_keys = self.search_keys.copy()
|
|
2399
2286
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2400
2287
|
|
|
2401
|
-
maybe_date_column =
|
|
2288
|
+
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2402
2289
|
has_date = maybe_date_column is not None
|
|
2403
2290
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2404
|
-
|
|
2405
2291
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2406
2292
|
|
|
2407
2293
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2431,13 +2317,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2431
2317
|
self.fit_generated_features = []
|
|
2432
2318
|
|
|
2433
2319
|
if has_date:
|
|
2434
|
-
converter = DateTimeSearchKeyConverter(
|
|
2435
|
-
maybe_date_column,
|
|
2436
|
-
self.date_format,
|
|
2437
|
-
self.logger,
|
|
2438
|
-
bundle=self.bundle,
|
|
2439
|
-
warnings_counter=self.warning_counter,
|
|
2440
|
-
)
|
|
2320
|
+
converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2441
2321
|
df = converter.convert(df, keep_time=True)
|
|
2442
2322
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2443
2323
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2454,12 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2454
2334
|
|
|
2455
2335
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2456
2336
|
|
|
2457
|
-
|
|
2458
|
-
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2459
|
-
)
|
|
2460
|
-
df = normalizer.normalize(df)
|
|
2461
|
-
columns_renaming = normalizer.columns_renaming
|
|
2462
|
-
self.fit_columns_renaming = columns_renaming
|
|
2337
|
+
# TODO normalize and convert all columns
|
|
2463
2338
|
|
|
2464
2339
|
df = remove_fintech_duplicates(
|
|
2465
2340
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2467,59 +2342,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2467
2342
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2468
2343
|
|
|
2469
2344
|
# Explode multiple search keys
|
|
2470
|
-
|
|
2345
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2346
|
+
meaning_types = {
|
|
2347
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
+
}
|
|
2350
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2471
2354
|
|
|
2472
2355
|
# TODO check that this is correct for enrichment
|
|
2473
2356
|
self.df_with_original_index = df.copy()
|
|
2474
|
-
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2475
2357
|
|
|
2476
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys
|
|
2358
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2477
2359
|
|
|
2478
2360
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2479
2361
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2480
2362
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
+
email_converted_to_hem = False
|
|
2481
2364
|
if email_column:
|
|
2482
2365
|
converter = EmailSearchKeyConverter(
|
|
2483
|
-
email_column,
|
|
2484
|
-
hem_column,
|
|
2485
|
-
self.fit_search_keys,
|
|
2486
|
-
columns_renaming,
|
|
2487
|
-
list(unnest_search_keys.keys()),
|
|
2488
|
-
self.logger,
|
|
2366
|
+
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2489
2367
|
)
|
|
2490
2368
|
df = converter.convert(df)
|
|
2491
2369
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2492
|
-
|
|
2493
|
-
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2494
|
-
if ip_column:
|
|
2495
|
-
converter = IpSearchKeyConverter(
|
|
2496
|
-
ip_column,
|
|
2497
|
-
self.fit_search_keys,
|
|
2498
|
-
columns_renaming,
|
|
2499
|
-
list(unnest_search_keys.keys()),
|
|
2500
|
-
self.bundle,
|
|
2501
|
-
self.logger,
|
|
2502
|
-
)
|
|
2503
|
-
df = converter.convert(df)
|
|
2504
|
-
|
|
2505
|
-
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2506
|
-
country_column = self._get_country_column(self.fit_search_keys)
|
|
2507
|
-
if phone_column:
|
|
2508
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2509
|
-
df = converter.convert(df)
|
|
2510
|
-
|
|
2511
|
-
if country_column:
|
|
2512
|
-
converter = CountrySearchKeyConverter(country_column)
|
|
2513
|
-
df = converter.convert(df)
|
|
2514
|
-
|
|
2515
|
-
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2516
|
-
if postal_code:
|
|
2517
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2518
|
-
df = converter.convert(df)
|
|
2370
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2519
2371
|
|
|
2520
2372
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2521
2373
|
self.fit_search_keys.keys()
|
|
2522
2374
|
)
|
|
2375
|
+
if email_converted_to_hem:
|
|
2376
|
+
non_feature_columns.append(email_column)
|
|
2523
2377
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2524
2378
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2525
2379
|
|
|
@@ -2531,6 +2385,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2531
2385
|
self.fit_dropped_features.update(features_to_drop)
|
|
2532
2386
|
df = df.drop(columns=features_to_drop)
|
|
2533
2387
|
|
|
2388
|
+
if email_converted_to_hem:
|
|
2389
|
+
self.fit_dropped_features.add(email_column)
|
|
2390
|
+
|
|
2534
2391
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2535
2392
|
|
|
2536
2393
|
meaning_types = {
|
|
@@ -2544,12 +2401,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2544
2401
|
if eval_set is not None and len(eval_set) > 0:
|
|
2545
2402
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2546
2403
|
|
|
2547
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2548
|
-
|
|
2549
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2550
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2551
|
-
|
|
2552
|
-
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2404
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2553
2405
|
|
|
2554
2406
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2555
2407
|
|
|
@@ -2567,7 +2419,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2567
2419
|
rest_client=self.rest_client,
|
|
2568
2420
|
logger=self.logger,
|
|
2569
2421
|
)
|
|
2570
|
-
|
|
2422
|
+
if email_converted_to_hem:
|
|
2423
|
+
dataset.ignore_columns = [email_column]
|
|
2571
2424
|
|
|
2572
2425
|
self.passed_features = [
|
|
2573
2426
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2956,7 +2809,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2956
2809
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2957
2810
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2958
2811
|
else:
|
|
2959
|
-
date_column =
|
|
2812
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
2960
2813
|
sort_columns = [date_column] if date_column is not None else []
|
|
2961
2814
|
|
|
2962
2815
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3052,10 +2905,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3052
2905
|
|
|
3053
2906
|
do_without_pandas_limits(print_datasets_sample)
|
|
3054
2907
|
|
|
3055
|
-
maybe_date_col =
|
|
2908
|
+
maybe_date_col = self._get_date_column(self.search_keys)
|
|
3056
2909
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
3057
2910
|
# TODO cast date column to single dtype
|
|
3058
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format
|
|
2911
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3059
2912
|
converted_X = date_converter.convert(X)
|
|
3060
2913
|
min_date = converted_X[maybe_date_col].min()
|
|
3061
2914
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -3082,6 +2935,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3082
2935
|
|
|
3083
2936
|
return df
|
|
3084
2937
|
|
|
2938
|
+
@staticmethod
|
|
2939
|
+
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
+
for col, t in search_keys.items():
|
|
2941
|
+
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
+
return col
|
|
2943
|
+
|
|
3085
2944
|
@staticmethod
|
|
3086
2945
|
def _add_current_date_as_key(
|
|
3087
2946
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -3097,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3097
2956
|
logger.warning(msg)
|
|
3098
2957
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3099
2958
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3100
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
2959
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
3101
2960
|
df = converter.convert(df)
|
|
3102
2961
|
return df
|
|
3103
2962
|
|
|
@@ -3125,37 +2984,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3125
2984
|
if len(cols) == 1:
|
|
3126
2985
|
return cols[0]
|
|
3127
2986
|
|
|
3128
|
-
@staticmethod
|
|
3129
|
-
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3130
|
-
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3131
|
-
if len(cols) > 1:
|
|
3132
|
-
raise Exception("More than one ip column found after unnest")
|
|
3133
|
-
if len(cols) == 1:
|
|
3134
|
-
return cols[0]
|
|
3135
|
-
|
|
3136
2987
|
@staticmethod
|
|
3137
2988
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3138
2989
|
for col, t in search_keys.items():
|
|
3139
2990
|
if t == SearchKey.PHONE:
|
|
3140
2991
|
return col
|
|
3141
2992
|
|
|
3142
|
-
@staticmethod
|
|
3143
|
-
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3144
|
-
for col, t in search_keys.items():
|
|
3145
|
-
if t == SearchKey.COUNTRY:
|
|
3146
|
-
return col
|
|
3147
|
-
|
|
3148
|
-
@staticmethod
|
|
3149
|
-
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3150
|
-
for col, t in search_keys.items():
|
|
3151
|
-
if t == SearchKey.POSTAL_CODE:
|
|
3152
|
-
return col
|
|
3153
|
-
|
|
3154
2993
|
def _explode_multiple_search_keys(
|
|
3155
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
2994
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3156
2995
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
3157
2996
|
# find groups of multiple search keys
|
|
3158
|
-
search_key_names_by_type: Dict[SearchKey,
|
|
2997
|
+
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3159
2998
|
for key_name, key_type in search_keys.items():
|
|
3160
2999
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3161
3000
|
search_key_names_by_type = {
|
|
@@ -3179,7 +3018,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3179
3018
|
del search_keys[old_key]
|
|
3180
3019
|
search_keys[new_search_key] = key_type
|
|
3181
3020
|
unnest_search_keys[new_search_key] = key_names
|
|
3182
|
-
columns_renaming[new_search_key] = new_search_key
|
|
3183
3021
|
|
|
3184
3022
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3185
3023
|
return df, unnest_search_keys
|
|
@@ -3187,7 +3025,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3187
3025
|
def __add_fit_system_record_id(
|
|
3188
3026
|
self,
|
|
3189
3027
|
df: pd.DataFrame,
|
|
3190
|
-
|
|
3028
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3191
3029
|
search_keys: Dict[str, SearchKey],
|
|
3192
3030
|
id_name: str,
|
|
3193
3031
|
) -> pd.DataFrame:
|
|
@@ -3210,9 +3048,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3210
3048
|
]
|
|
3211
3049
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3212
3050
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3213
|
-
sort_exclude_columns.append(
|
|
3051
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3214
3052
|
else:
|
|
3215
|
-
date_column =
|
|
3053
|
+
date_column = self._get_date_column(search_keys)
|
|
3216
3054
|
sort_columns = [date_column] if date_column is not None else []
|
|
3217
3055
|
|
|
3218
3056
|
other_columns = sorted(
|
|
@@ -3221,6 +3059,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3221
3059
|
for c in df.columns
|
|
3222
3060
|
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
3223
3061
|
]
|
|
3062
|
+
# [
|
|
3063
|
+
# sk
|
|
3064
|
+
# for sk, key_type in search_keys.items()
|
|
3065
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
+
# and sk in df.columns
|
|
3067
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
+
# ]
|
|
3224
3069
|
)
|
|
3225
3070
|
|
|
3226
3071
|
search_keys_hash = "search_keys_hash"
|
|
@@ -3233,6 +3078,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3233
3078
|
if search_keys_hash in df.columns:
|
|
3234
3079
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3235
3080
|
|
|
3081
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
+
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
+
|
|
3236
3084
|
df = df.reset_index(drop=True).reset_index()
|
|
3237
3085
|
# system_record_id saves correct order for fit
|
|
3238
3086
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3242,11 +3090,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3242
3090
|
df.index.name = original_index_name
|
|
3243
3091
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3244
3092
|
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3093
|
+
meaning_types[id_name] = (
|
|
3094
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3095
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3096
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3097
|
+
)
|
|
3250
3098
|
return df
|
|
3251
3099
|
|
|
3252
3100
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3624,13 +3472,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3624
3472
|
for _, key_type in search_keys.items():
|
|
3625
3473
|
if not isinstance(key_type, SearchKey):
|
|
3626
3474
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3627
|
-
|
|
3628
3475
|
valid_search_keys = {}
|
|
3629
3476
|
unsupported_search_keys = {
|
|
3630
3477
|
SearchKey.IP_RANGE_FROM,
|
|
3631
3478
|
SearchKey.IP_RANGE_TO,
|
|
3632
|
-
SearchKey.IPV6_RANGE_FROM,
|
|
3633
|
-
SearchKey.IPV6_RANGE_TO,
|
|
3634
3479
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3635
3480
|
SearchKey.MSISDN_RANGE_TO,
|
|
3636
3481
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3720,7 +3565,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3720
3565
|
print(msg)
|
|
3721
3566
|
self.logger.warning(msg)
|
|
3722
3567
|
self.warning_counter.increment()
|
|
3723
|
-
# TODO maybe raise ValidationError
|
|
3724
3568
|
|
|
3725
3569
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3726
3570
|
|