upgini 1.1.312a4__py3-none-any.whl → 1.1.313__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +26 -7
- upgini/autofe/binary.py +95 -4
- upgini/autofe/date.py +26 -6
- upgini/autofe/feature.py +25 -11
- upgini/autofe/unary.py +7 -0
- upgini/dataset.py +386 -33
- upgini/features_enricher.py +142 -287
- upgini/metadata.py +1 -16
- upgini/normalizer/phone_normalizer.py +340 -0
- upgini/utils/country_utils.py +0 -16
- upgini/utils/datetime_utils.py +16 -38
- upgini/utils/email_utils.py +17 -49
- upgini/utils/ip_utils.py +1 -100
- upgini/utils/phone_utils.py +0 -345
- upgini/utils/postal_code_utils.py +0 -34
- {upgini-1.1.312a4.dist-info → upgini-1.1.313.dist-info}/METADATA +3 -1
- {upgini-1.1.312a4.dist-info → upgini-1.1.313.dist-info}/RECORD +20 -20
- {upgini-1.1.312a4.dist-info → upgini-1.1.313.dist-info}/WHEEL +1 -1
- upgini/normalizer/normalize_utils.py +0 -203
- {upgini-1.1.312a4.dist-info → upgini-1.1.313.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -61,15 +61,11 @@ from upgini.metadata import (
|
|
|
61
61
|
SearchKey,
|
|
62
62
|
)
|
|
63
63
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
64
|
-
from upgini.normalizer.normalize_utils import Normalizer
|
|
65
64
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
66
65
|
from upgini.search_task import SearchTask
|
|
67
66
|
from upgini.spinner import Spinner
|
|
68
67
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
69
|
-
from upgini.utils.country_utils import
|
|
70
|
-
CountrySearchKeyConverter,
|
|
71
|
-
CountrySearchKeyDetector,
|
|
72
|
-
)
|
|
68
|
+
from upgini.utils.country_utils import CountrySearchKeyDetector
|
|
73
69
|
from upgini.utils.custom_loss_utils import (
|
|
74
70
|
get_additional_params_custom_loss,
|
|
75
71
|
get_runtime_params_custom_loss,
|
|
@@ -91,15 +87,11 @@ from upgini.utils.display_utils import (
|
|
|
91
87
|
prepare_and_show_report,
|
|
92
88
|
show_request_quote_button,
|
|
93
89
|
)
|
|
94
|
-
from upgini.utils.email_utils import
|
|
90
|
+
from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
95
91
|
from upgini.utils.features_validator import FeaturesValidator
|
|
96
92
|
from upgini.utils.format import Format
|
|
97
|
-
from upgini.utils.
|
|
98
|
-
from upgini.utils.
|
|
99
|
-
from upgini.utils.postal_code_utils import (
|
|
100
|
-
PostalCodeSearchKeyConverter,
|
|
101
|
-
PostalCodeSearchKeyDetector,
|
|
102
|
-
)
|
|
93
|
+
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
|
94
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
|
103
95
|
|
|
104
96
|
try:
|
|
105
97
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -245,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
245
237
|
|
|
246
238
|
self.passed_features: List[str] = []
|
|
247
239
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
248
|
-
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
249
240
|
self.country_added = False
|
|
250
241
|
self.fit_generated_features: List[str] = []
|
|
251
242
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -256,7 +247,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
256
247
|
self.eval_set: Optional[List[Tuple]] = None
|
|
257
248
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
258
249
|
self.imbalanced = False
|
|
259
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict
|
|
250
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
260
251
|
|
|
261
252
|
validate_version(self.logger)
|
|
262
253
|
self.search_keys = search_keys or {}
|
|
@@ -715,7 +706,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
715
706
|
|
|
716
707
|
start_time = time.time()
|
|
717
708
|
try:
|
|
718
|
-
result
|
|
709
|
+
result = self.__inner_transform(
|
|
719
710
|
trace_id,
|
|
720
711
|
X,
|
|
721
712
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -915,16 +906,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
915
906
|
search_keys,
|
|
916
907
|
groups,
|
|
917
908
|
_cv,
|
|
918
|
-
columns_renaming,
|
|
919
909
|
) = prepared_data
|
|
920
910
|
|
|
921
|
-
# rename cat_features
|
|
922
|
-
if cat_features:
|
|
923
|
-
for new_c, old_c in columns_renaming.items():
|
|
924
|
-
if old_c in cat_features:
|
|
925
|
-
cat_features.remove(old_c)
|
|
926
|
-
cat_features.append(new_c)
|
|
927
|
-
|
|
928
911
|
gc.collect()
|
|
929
912
|
|
|
930
913
|
print(self.bundle.get("metrics_start"))
|
|
@@ -937,7 +920,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
937
920
|
|
|
938
921
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
939
922
|
|
|
940
|
-
has_date =
|
|
923
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
941
924
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
942
925
|
|
|
943
926
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1130,7 +1113,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1130
1113
|
)
|
|
1131
1114
|
|
|
1132
1115
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1133
|
-
date_column =
|
|
1116
|
+
date_column = self._get_date_column(search_keys)
|
|
1134
1117
|
if (
|
|
1135
1118
|
uplift_col in metrics_df.columns
|
|
1136
1119
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1212,6 +1195,27 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1212
1195
|
def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
|
|
1213
1196
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
|
1214
1197
|
|
|
1198
|
+
def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
|
|
1199
|
+
search_keys = self.search_keys.copy()
|
|
1200
|
+
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1201
|
+
|
|
1202
|
+
extended_X = x.copy()
|
|
1203
|
+
generated_features = []
|
|
1204
|
+
date_column = self._get_date_column(search_keys)
|
|
1205
|
+
if date_column is not None:
|
|
1206
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1207
|
+
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1208
|
+
generated_features.extend(converter.generated_features)
|
|
1209
|
+
email_column = self._get_email_column(search_keys)
|
|
1210
|
+
hem_column = self._get_hem_column(search_keys)
|
|
1211
|
+
if email_column:
|
|
1212
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1213
|
+
extended_X = converter.convert(extended_X)
|
|
1214
|
+
generated_features.extend(converter.generated_features)
|
|
1215
|
+
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1216
|
+
|
|
1217
|
+
return extended_X, search_keys
|
|
1218
|
+
|
|
1215
1219
|
def _is_input_same_as_fit(
|
|
1216
1220
|
self,
|
|
1217
1221
|
X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
|
@@ -1255,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1255
1259
|
groups = None
|
|
1256
1260
|
|
|
1257
1261
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1258
|
-
date_column =
|
|
1262
|
+
date_column = self._get_date_column(search_keys)
|
|
1259
1263
|
date_series = X[date_column] if date_column is not None else None
|
|
1260
1264
|
_cv, groups = CVConfig(
|
|
1261
1265
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1278,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1278
1282
|
|
|
1279
1283
|
def _get_client_cat_features(
|
|
1280
1284
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1281
|
-
) ->
|
|
1285
|
+
) -> Optional[List[str]]:
|
|
1282
1286
|
cat_features = None
|
|
1283
1287
|
search_keys_for_metrics = []
|
|
1284
1288
|
if (
|
|
@@ -1338,15 +1342,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1338
1342
|
progress_bar,
|
|
1339
1343
|
progress_callback,
|
|
1340
1344
|
)
|
|
1341
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
|
|
1342
|
-
sampled_data
|
|
1343
|
-
)
|
|
1345
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
|
|
1344
1346
|
|
|
1345
1347
|
excluding_search_keys = list(search_keys.keys())
|
|
1346
1348
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1347
|
-
for sk in excluding_search_keys
|
|
1348
|
-
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1349
|
-
excluding_search_keys.remove(sk)
|
|
1349
|
+
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
|
|
1350
1350
|
|
|
1351
1351
|
client_features = [
|
|
1352
1352
|
c
|
|
@@ -1392,7 +1392,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1392
1392
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1393
1393
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1394
1394
|
|
|
1395
|
-
# TODO maybe there is no more need for these convertions
|
|
1396
1395
|
# Remove datetime features
|
|
1397
1396
|
datetime_features = [
|
|
1398
1397
|
f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
|
|
@@ -1480,7 +1479,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1480
1479
|
search_keys,
|
|
1481
1480
|
groups,
|
|
1482
1481
|
cv,
|
|
1483
|
-
columns_renaming,
|
|
1484
1482
|
)
|
|
1485
1483
|
|
|
1486
1484
|
@dataclass
|
|
@@ -1490,7 +1488,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1490
1488
|
enriched_X: pd.DataFrame
|
|
1491
1489
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1492
1490
|
search_keys: Dict[str, SearchKey]
|
|
1493
|
-
columns_renaming: Dict[str, str]
|
|
1494
1491
|
|
|
1495
1492
|
def _sample_data_for_metrics(
|
|
1496
1493
|
self,
|
|
@@ -1530,15 +1527,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1530
1527
|
)
|
|
1531
1528
|
|
|
1532
1529
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1533
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
|
|
1534
|
-
self.__cached_sampled_datasets
|
|
1535
|
-
)
|
|
1530
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
|
|
1536
1531
|
if exclude_features_sources:
|
|
1537
1532
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1538
1533
|
|
|
1539
|
-
return self.__mk_sampled_data_tuple(
|
|
1540
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1541
|
-
)
|
|
1534
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1542
1535
|
|
|
1543
1536
|
def __sample_only_input(
|
|
1544
1537
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1556,28 +1549,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1556
1549
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1557
1550
|
df = pd.concat([df, eval_xy])
|
|
1558
1551
|
|
|
1559
|
-
search_keys = self.search_keys.copy()
|
|
1560
|
-
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1561
|
-
|
|
1562
|
-
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1563
|
-
generated_features = []
|
|
1564
|
-
if date_column is not None:
|
|
1565
|
-
converter = DateTimeSearchKeyConverter(
|
|
1566
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1567
|
-
)
|
|
1568
|
-
df = converter.convert(df, keep_time=True)
|
|
1569
|
-
generated_features = converter.generated_features
|
|
1570
|
-
|
|
1571
|
-
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
1572
|
-
if email_columns:
|
|
1573
|
-
generator = EmailDomainGenerator(email_columns)
|
|
1574
|
-
df = generator.generate(df)
|
|
1575
|
-
generated_features.extend(generator.generated_features)
|
|
1576
|
-
|
|
1577
|
-
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1578
|
-
df = normalizer.normalize(df)
|
|
1579
|
-
columns_renaming = normalizer.columns_renaming
|
|
1580
|
-
|
|
1581
1552
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1582
1553
|
|
|
1583
1554
|
num_samples = _num_samples(df)
|
|
@@ -1590,34 +1561,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1590
1561
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1591
1562
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1592
1563
|
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1564
|
+
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1565
|
+
df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1596
1566
|
|
|
1597
|
-
train_df =
|
|
1567
|
+
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1598
1568
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1599
1569
|
y_sampled = train_df[TARGET].copy()
|
|
1600
1570
|
enriched_X = X_sampled
|
|
1601
1571
|
|
|
1602
1572
|
if eval_set is not None:
|
|
1603
1573
|
for idx in range(len(eval_set)):
|
|
1604
|
-
eval_xy_sampled =
|
|
1574
|
+
eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1605
1575
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1606
1576
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1607
1577
|
enriched_eval_X = eval_X_sampled
|
|
1608
1578
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1609
|
-
self.__cached_sampled_datasets = (
|
|
1610
|
-
X_sampled,
|
|
1611
|
-
y_sampled,
|
|
1612
|
-
enriched_X,
|
|
1613
|
-
eval_set_sampled_dict,
|
|
1614
|
-
search_keys,
|
|
1615
|
-
columns_renaming,
|
|
1616
|
-
)
|
|
1579
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1617
1580
|
|
|
1618
|
-
return self.__mk_sampled_data_tuple(
|
|
1619
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1620
|
-
)
|
|
1581
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1621
1582
|
|
|
1622
1583
|
def __sample_balanced(
|
|
1623
1584
|
self,
|
|
@@ -1629,7 +1590,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1629
1590
|
search_keys = self.fit_search_keys
|
|
1630
1591
|
|
|
1631
1592
|
rows_to_drop = None
|
|
1632
|
-
has_date =
|
|
1593
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
1633
1594
|
task_type = self.model_task_type or define_task(
|
|
1634
1595
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1635
1596
|
)
|
|
@@ -1683,18 +1644,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1683
1644
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1684
1645
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1685
1646
|
|
|
1686
|
-
self.__cached_sampled_datasets = (
|
|
1687
|
-
X_sampled,
|
|
1688
|
-
y_sampled,
|
|
1689
|
-
enriched_X,
|
|
1690
|
-
eval_set_sampled_dict,
|
|
1691
|
-
search_keys,
|
|
1692
|
-
self.fit_columns_renaming,
|
|
1693
|
-
)
|
|
1647
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1694
1648
|
|
|
1695
|
-
return self.__mk_sampled_data_tuple(
|
|
1696
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1697
|
-
)
|
|
1649
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1698
1650
|
|
|
1699
1651
|
def __sample_imbalanced(
|
|
1700
1652
|
self,
|
|
@@ -1734,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1734
1686
|
tmp_target_name = "__target"
|
|
1735
1687
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1736
1688
|
|
|
1737
|
-
enriched_df
|
|
1689
|
+
enriched_df = self.__inner_transform(
|
|
1738
1690
|
trace_id,
|
|
1739
1691
|
df,
|
|
1740
1692
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1782,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1782
1734
|
tmp_target_name = "__target"
|
|
1783
1735
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1784
1736
|
|
|
1785
|
-
enriched_Xy
|
|
1737
|
+
enriched_Xy = self.__inner_transform(
|
|
1786
1738
|
trace_id,
|
|
1787
1739
|
df,
|
|
1788
1740
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1807,18 +1759,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1807
1759
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1808
1760
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1809
1761
|
|
|
1810
|
-
self.__cached_sampled_datasets = (
|
|
1811
|
-
X_sampled,
|
|
1812
|
-
y_sampled,
|
|
1813
|
-
enriched_X,
|
|
1814
|
-
eval_set_sampled_dict,
|
|
1815
|
-
self.search_keys,
|
|
1816
|
-
columns_renaming,
|
|
1817
|
-
)
|
|
1762
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
|
|
1818
1763
|
|
|
1819
|
-
return self.__mk_sampled_data_tuple(
|
|
1820
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1821
|
-
)
|
|
1764
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
|
|
1822
1765
|
|
|
1823
1766
|
def __mk_sampled_data_tuple(
|
|
1824
1767
|
self,
|
|
@@ -1827,7 +1770,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1827
1770
|
enriched_X: pd.DataFrame,
|
|
1828
1771
|
eval_set_sampled_dict: Dict,
|
|
1829
1772
|
search_keys: Dict,
|
|
1830
|
-
columns_renaming: Dict[str, str],
|
|
1831
1773
|
):
|
|
1832
1774
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1833
1775
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1836,7 +1778,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1836
1778
|
enriched_X=enriched_X,
|
|
1837
1779
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1838
1780
|
search_keys=search_keys,
|
|
1839
|
-
columns_renaming=columns_renaming,
|
|
1840
1781
|
)
|
|
1841
1782
|
|
|
1842
1783
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1925,7 +1866,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1925
1866
|
progress_bar: Optional[ProgressBar] = None,
|
|
1926
1867
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1927
1868
|
add_fit_system_record_id: bool = False,
|
|
1928
|
-
) ->
|
|
1869
|
+
) -> pd.DataFrame:
|
|
1929
1870
|
if self._search_task is None:
|
|
1930
1871
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1931
1872
|
|
|
@@ -1938,13 +1879,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1938
1879
|
|
|
1939
1880
|
if len(self.feature_names_) == 0:
|
|
1940
1881
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1941
|
-
return X
|
|
1882
|
+
return X
|
|
1942
1883
|
|
|
1943
1884
|
if self._has_paid_features(exclude_features_sources):
|
|
1944
1885
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1945
1886
|
self.logger.warning(msg)
|
|
1946
1887
|
self.__display_support_link(msg)
|
|
1947
|
-
return None
|
|
1888
|
+
return None
|
|
1948
1889
|
|
|
1949
1890
|
if not metrics_calculation:
|
|
1950
1891
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1955,7 +1896,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1955
1896
|
self.logger.warning(msg)
|
|
1956
1897
|
print(msg)
|
|
1957
1898
|
show_request_quote_button()
|
|
1958
|
-
return None
|
|
1899
|
+
return None
|
|
1959
1900
|
else:
|
|
1960
1901
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1961
1902
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -1993,11 +1934,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1993
1934
|
df = self.__add_country_code(df, search_keys)
|
|
1994
1935
|
|
|
1995
1936
|
generated_features = []
|
|
1996
|
-
date_column =
|
|
1937
|
+
date_column = self._get_date_column(search_keys)
|
|
1997
1938
|
if date_column is not None:
|
|
1998
|
-
converter = DateTimeSearchKeyConverter(
|
|
1999
|
-
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2000
|
-
)
|
|
1939
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2001
1940
|
df = converter.convert(df)
|
|
2002
1941
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2003
1942
|
generated_features.extend(converter.generated_features)
|
|
@@ -2006,93 +1945,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2006
1945
|
if self.add_date_if_missing:
|
|
2007
1946
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2008
1947
|
|
|
2009
|
-
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2010
|
-
if email_columns:
|
|
2011
|
-
generator = EmailDomainGenerator(email_columns)
|
|
2012
|
-
df = generator.generate(df)
|
|
2013
|
-
generated_features.extend(generator.generated_features)
|
|
2014
|
-
|
|
2015
|
-
normalizer = Normalizer(
|
|
2016
|
-
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2017
|
-
)
|
|
2018
|
-
df = normalizer.normalize(df)
|
|
2019
|
-
columns_renaming = normalizer.columns_renaming
|
|
2020
|
-
|
|
2021
1948
|
# Don't pass all features in backend on transform
|
|
1949
|
+
original_features_for_transform = []
|
|
2022
1950
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
2023
|
-
|
|
2024
|
-
if len(
|
|
2025
|
-
|
|
1951
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1952
|
+
if len(features_not_to_pass) > 0:
|
|
1953
|
+
# Pass only features that need for transform
|
|
1954
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
+
original_features_for_transform = [
|
|
1958
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
+
]
|
|
1960
|
+
|
|
1961
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2026
1962
|
|
|
2027
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) +
|
|
1963
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2028
1964
|
|
|
2029
1965
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
2030
1966
|
df[columns_for_system_record_id], index=False
|
|
2031
1967
|
).astype("Float64")
|
|
2032
1968
|
|
|
2033
1969
|
# Explode multiple search keys
|
|
2034
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys
|
|
1970
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2035
1971
|
|
|
2036
1972
|
email_column = self._get_email_column(search_keys)
|
|
2037
1973
|
hem_column = self._get_hem_column(search_keys)
|
|
1974
|
+
email_converted_to_hem = False
|
|
2038
1975
|
if email_column:
|
|
2039
1976
|
converter = EmailSearchKeyConverter(
|
|
2040
|
-
email_column,
|
|
2041
|
-
hem_column,
|
|
2042
|
-
search_keys,
|
|
2043
|
-
columns_renaming,
|
|
2044
|
-
list(unnest_search_keys.keys()),
|
|
2045
|
-
self.logger,
|
|
1977
|
+
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2046
1978
|
)
|
|
2047
1979
|
df = converter.convert(df)
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
if ip_column:
|
|
2051
|
-
converter = IpSearchKeyConverter(
|
|
2052
|
-
ip_column,
|
|
2053
|
-
search_keys,
|
|
2054
|
-
columns_renaming,
|
|
2055
|
-
list(unnest_search_keys.keys()),
|
|
2056
|
-
self.bundle,
|
|
2057
|
-
self.logger,
|
|
2058
|
-
)
|
|
2059
|
-
df = converter.convert(df)
|
|
2060
|
-
|
|
2061
|
-
phone_column = self._get_phone_column(search_keys)
|
|
2062
|
-
country_column = self._get_country_column(search_keys)
|
|
2063
|
-
if phone_column:
|
|
2064
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2065
|
-
df = converter.convert(df)
|
|
2066
|
-
|
|
2067
|
-
if country_column:
|
|
2068
|
-
converter = CountrySearchKeyConverter(country_column)
|
|
2069
|
-
df = converter.convert(df)
|
|
2070
|
-
|
|
2071
|
-
postal_code = self._get_postal_column(search_keys)
|
|
2072
|
-
if postal_code:
|
|
2073
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2074
|
-
df = converter.convert(df)
|
|
2075
|
-
|
|
1980
|
+
generated_features.extend(converter.generated_features)
|
|
1981
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2076
1982
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2077
1983
|
|
|
2078
1984
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2079
|
-
for
|
|
1985
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1986
|
+
for col in original_features_for_transform:
|
|
2080
1987
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2081
|
-
features_not_to_pass = [
|
|
2082
|
-
|
|
2083
|
-
|
|
2084
|
-
|
|
2085
|
-
|
|
1988
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1989
|
+
|
|
1990
|
+
if email_converted_to_hem:
|
|
1991
|
+
features_not_to_pass.append(email_column)
|
|
1992
|
+
|
|
1993
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2086
1995
|
|
|
2087
1996
|
if add_fit_system_record_id:
|
|
2088
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2089
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2090
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1997
|
+
df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
|
|
2091
1998
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2092
1999
|
features_not_to_pass.append(SORT_ID)
|
|
2093
2000
|
|
|
2094
|
-
|
|
2095
|
-
|
|
2001
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
2002
|
+
|
|
2096
2003
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2097
2004
|
"Float64"
|
|
2098
2005
|
)
|
|
@@ -2128,7 +2035,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2128
2035
|
rest_client=self.rest_client,
|
|
2129
2036
|
logger=self.logger,
|
|
2130
2037
|
)
|
|
2131
|
-
|
|
2038
|
+
if email_converted_to_hem:
|
|
2039
|
+
dataset.ignore_columns = [email_column]
|
|
2132
2040
|
|
|
2133
2041
|
if max_features is not None or importance_threshold is not None:
|
|
2134
2042
|
exclude_features_sources = list(
|
|
@@ -2230,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2230
2138
|
if add_fit_system_record_id:
|
|
2231
2139
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2232
2140
|
|
|
2233
|
-
return result
|
|
2141
|
+
return result
|
|
2234
2142
|
|
|
2235
2143
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2236
2144
|
features_info = self._internal_features_info
|
|
@@ -2331,9 +2239,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2331
2239
|
self.df_with_original_index = None
|
|
2332
2240
|
self.__cached_sampled_datasets = None
|
|
2333
2241
|
self.metrics = None
|
|
2334
|
-
self.fit_columns_renaming = None
|
|
2335
|
-
self.fit_dropped_features = set()
|
|
2336
|
-
self.fit_generated_features = []
|
|
2337
2242
|
|
|
2338
2243
|
validated_X = self._validate_X(X)
|
|
2339
2244
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2380,10 +2285,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2380
2285
|
self.fit_search_keys = self.search_keys.copy()
|
|
2381
2286
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2382
2287
|
|
|
2383
|
-
maybe_date_column =
|
|
2288
|
+
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2384
2289
|
has_date = maybe_date_column is not None
|
|
2385
2290
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2386
|
-
|
|
2387
2291
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2388
2292
|
|
|
2389
2293
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2413,13 +2317,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2413
2317
|
self.fit_generated_features = []
|
|
2414
2318
|
|
|
2415
2319
|
if has_date:
|
|
2416
|
-
converter = DateTimeSearchKeyConverter(
|
|
2417
|
-
maybe_date_column,
|
|
2418
|
-
self.date_format,
|
|
2419
|
-
self.logger,
|
|
2420
|
-
bundle=self.bundle,
|
|
2421
|
-
warnings_counter=self.warning_counter,
|
|
2422
|
-
)
|
|
2320
|
+
converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2423
2321
|
df = converter.convert(df, keep_time=True)
|
|
2424
2322
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2425
2323
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2428,14 +2326,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2428
2326
|
if self.add_date_if_missing:
|
|
2429
2327
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2430
2328
|
|
|
2431
|
-
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
2432
|
-
if email_columns:
|
|
2433
|
-
generator = EmailDomainGenerator(
|
|
2434
|
-
email_columns
|
|
2435
|
-
)
|
|
2436
|
-
df = generator.generate(df)
|
|
2437
|
-
self.fit_generated_features.extend(generator.generated_features)
|
|
2438
|
-
|
|
2439
2329
|
# Checks that need validated date
|
|
2440
2330
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2441
2331
|
|
|
@@ -2444,12 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2444
2334
|
|
|
2445
2335
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2446
2336
|
|
|
2447
|
-
|
|
2448
|
-
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2449
|
-
)
|
|
2450
|
-
df = normalizer.normalize(df)
|
|
2451
|
-
columns_renaming = normalizer.columns_renaming
|
|
2452
|
-
self.fit_columns_renaming = columns_renaming
|
|
2337
|
+
# TODO normalize and convert all columns
|
|
2453
2338
|
|
|
2454
2339
|
df = remove_fintech_duplicates(
|
|
2455
2340
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2457,58 +2342,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2457
2342
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2458
2343
|
|
|
2459
2344
|
# Explode multiple search keys
|
|
2460
|
-
|
|
2345
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2346
|
+
meaning_types = {
|
|
2347
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
+
}
|
|
2350
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2461
2354
|
|
|
2462
2355
|
# TODO check that this is correct for enrichment
|
|
2463
2356
|
self.df_with_original_index = df.copy()
|
|
2464
|
-
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2465
2357
|
|
|
2466
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys
|
|
2358
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2467
2359
|
|
|
2468
2360
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2469
2361
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2470
2362
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
+
email_converted_to_hem = False
|
|
2471
2364
|
if email_column:
|
|
2472
2365
|
converter = EmailSearchKeyConverter(
|
|
2473
|
-
email_column,
|
|
2474
|
-
hem_column,
|
|
2475
|
-
self.fit_search_keys,
|
|
2476
|
-
columns_renaming,
|
|
2477
|
-
list(unnest_search_keys.keys()),
|
|
2478
|
-
self.logger,
|
|
2479
|
-
)
|
|
2480
|
-
df = converter.convert(df)
|
|
2481
|
-
|
|
2482
|
-
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2483
|
-
if ip_column:
|
|
2484
|
-
converter = IpSearchKeyConverter(
|
|
2485
|
-
ip_column,
|
|
2486
|
-
self.fit_search_keys,
|
|
2487
|
-
columns_renaming,
|
|
2488
|
-
list(unnest_search_keys.keys()),
|
|
2489
|
-
self.bundle,
|
|
2490
|
-
self.logger,
|
|
2366
|
+
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2491
2367
|
)
|
|
2492
2368
|
df = converter.convert(df)
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
country_column = self._get_country_column(self.fit_search_keys)
|
|
2496
|
-
if phone_column:
|
|
2497
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2498
|
-
df = converter.convert(df)
|
|
2499
|
-
|
|
2500
|
-
if country_column:
|
|
2501
|
-
converter = CountrySearchKeyConverter(country_column)
|
|
2502
|
-
df = converter.convert(df)
|
|
2503
|
-
|
|
2504
|
-
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2505
|
-
if postal_code:
|
|
2506
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2507
|
-
df = converter.convert(df)
|
|
2369
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2370
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2508
2371
|
|
|
2509
2372
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2510
2373
|
self.fit_search_keys.keys()
|
|
2511
2374
|
)
|
|
2375
|
+
if email_converted_to_hem:
|
|
2376
|
+
non_feature_columns.append(email_column)
|
|
2512
2377
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2513
2378
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2514
2379
|
|
|
@@ -2520,6 +2385,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2520
2385
|
self.fit_dropped_features.update(features_to_drop)
|
|
2521
2386
|
df = df.drop(columns=features_to_drop)
|
|
2522
2387
|
|
|
2388
|
+
if email_converted_to_hem:
|
|
2389
|
+
self.fit_dropped_features.add(email_column)
|
|
2390
|
+
|
|
2523
2391
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2524
2392
|
|
|
2525
2393
|
meaning_types = {
|
|
@@ -2533,12 +2401,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2533
2401
|
if eval_set is not None and len(eval_set) > 0:
|
|
2534
2402
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2535
2403
|
|
|
2536
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2537
|
-
|
|
2538
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2539
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2540
|
-
|
|
2541
|
-
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2404
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2542
2405
|
|
|
2543
2406
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2544
2407
|
|
|
@@ -2556,7 +2419,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2556
2419
|
rest_client=self.rest_client,
|
|
2557
2420
|
logger=self.logger,
|
|
2558
2421
|
)
|
|
2559
|
-
|
|
2422
|
+
if email_converted_to_hem:
|
|
2423
|
+
dataset.ignore_columns = [email_column]
|
|
2560
2424
|
|
|
2561
2425
|
self.passed_features = [
|
|
2562
2426
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2945,7 +2809,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2945
2809
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2946
2810
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2947
2811
|
else:
|
|
2948
|
-
date_column =
|
|
2812
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
2949
2813
|
sort_columns = [date_column] if date_column is not None else []
|
|
2950
2814
|
|
|
2951
2815
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3041,10 +2905,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3041
2905
|
|
|
3042
2906
|
do_without_pandas_limits(print_datasets_sample)
|
|
3043
2907
|
|
|
3044
|
-
maybe_date_col =
|
|
2908
|
+
maybe_date_col = self._get_date_column(self.search_keys)
|
|
3045
2909
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
3046
2910
|
# TODO cast date column to single dtype
|
|
3047
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format
|
|
2911
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3048
2912
|
converted_X = date_converter.convert(X)
|
|
3049
2913
|
min_date = converted_X[maybe_date_col].min()
|
|
3050
2914
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -3071,6 +2935,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3071
2935
|
|
|
3072
2936
|
return df
|
|
3073
2937
|
|
|
2938
|
+
@staticmethod
|
|
2939
|
+
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
+
for col, t in search_keys.items():
|
|
2941
|
+
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
+
return col
|
|
2943
|
+
|
|
3074
2944
|
@staticmethod
|
|
3075
2945
|
def _add_current_date_as_key(
|
|
3076
2946
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -3086,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3086
2956
|
logger.warning(msg)
|
|
3087
2957
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3088
2958
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3089
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
2959
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
3090
2960
|
df = converter.convert(df)
|
|
3091
2961
|
return df
|
|
3092
2962
|
|
|
@@ -3114,37 +2984,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3114
2984
|
if len(cols) == 1:
|
|
3115
2985
|
return cols[0]
|
|
3116
2986
|
|
|
3117
|
-
@staticmethod
|
|
3118
|
-
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3119
|
-
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3120
|
-
if len(cols) > 1:
|
|
3121
|
-
raise Exception("More than one ip column found after unnest")
|
|
3122
|
-
if len(cols) == 1:
|
|
3123
|
-
return cols[0]
|
|
3124
|
-
|
|
3125
2987
|
@staticmethod
|
|
3126
2988
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3127
2989
|
for col, t in search_keys.items():
|
|
3128
2990
|
if t == SearchKey.PHONE:
|
|
3129
2991
|
return col
|
|
3130
2992
|
|
|
3131
|
-
@staticmethod
|
|
3132
|
-
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3133
|
-
for col, t in search_keys.items():
|
|
3134
|
-
if t == SearchKey.COUNTRY:
|
|
3135
|
-
return col
|
|
3136
|
-
|
|
3137
|
-
@staticmethod
|
|
3138
|
-
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3139
|
-
for col, t in search_keys.items():
|
|
3140
|
-
if t == SearchKey.POSTAL_CODE:
|
|
3141
|
-
return col
|
|
3142
|
-
|
|
3143
2993
|
def _explode_multiple_search_keys(
|
|
3144
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
2994
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3145
2995
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
3146
2996
|
# find groups of multiple search keys
|
|
3147
|
-
search_key_names_by_type: Dict[SearchKey,
|
|
2997
|
+
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3148
2998
|
for key_name, key_type in search_keys.items():
|
|
3149
2999
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3150
3000
|
search_key_names_by_type = {
|
|
@@ -3168,7 +3018,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3168
3018
|
del search_keys[old_key]
|
|
3169
3019
|
search_keys[new_search_key] = key_type
|
|
3170
3020
|
unnest_search_keys[new_search_key] = key_names
|
|
3171
|
-
columns_renaming[new_search_key] = new_search_key
|
|
3172
3021
|
|
|
3173
3022
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3174
3023
|
return df, unnest_search_keys
|
|
@@ -3176,7 +3025,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3176
3025
|
def __add_fit_system_record_id(
|
|
3177
3026
|
self,
|
|
3178
3027
|
df: pd.DataFrame,
|
|
3179
|
-
|
|
3028
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3180
3029
|
search_keys: Dict[str, SearchKey],
|
|
3181
3030
|
id_name: str,
|
|
3182
3031
|
) -> pd.DataFrame:
|
|
@@ -3199,9 +3048,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3199
3048
|
]
|
|
3200
3049
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3201
3050
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3202
|
-
sort_exclude_columns.append(
|
|
3051
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3203
3052
|
else:
|
|
3204
|
-
date_column =
|
|
3053
|
+
date_column = self._get_date_column(search_keys)
|
|
3205
3054
|
sort_columns = [date_column] if date_column is not None else []
|
|
3206
3055
|
|
|
3207
3056
|
other_columns = sorted(
|
|
@@ -3210,6 +3059,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3210
3059
|
for c in df.columns
|
|
3211
3060
|
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
3212
3061
|
]
|
|
3062
|
+
# [
|
|
3063
|
+
# sk
|
|
3064
|
+
# for sk, key_type in search_keys.items()
|
|
3065
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
+
# and sk in df.columns
|
|
3067
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
+
# ]
|
|
3213
3069
|
)
|
|
3214
3070
|
|
|
3215
3071
|
search_keys_hash = "search_keys_hash"
|
|
@@ -3222,6 +3078,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3222
3078
|
if search_keys_hash in df.columns:
|
|
3223
3079
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3224
3080
|
|
|
3081
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
+
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
+
|
|
3225
3084
|
df = df.reset_index(drop=True).reset_index()
|
|
3226
3085
|
# system_record_id saves correct order for fit
|
|
3227
3086
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3231,11 +3090,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3231
3090
|
df.index.name = original_index_name
|
|
3232
3091
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3233
3092
|
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
|
|
3093
|
+
meaning_types[id_name] = (
|
|
3094
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3095
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3096
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3097
|
+
)
|
|
3239
3098
|
return df
|
|
3240
3099
|
|
|
3241
3100
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3613,13 +3472,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3613
3472
|
for _, key_type in search_keys.items():
|
|
3614
3473
|
if not isinstance(key_type, SearchKey):
|
|
3615
3474
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3616
|
-
|
|
3617
3475
|
valid_search_keys = {}
|
|
3618
3476
|
unsupported_search_keys = {
|
|
3619
3477
|
SearchKey.IP_RANGE_FROM,
|
|
3620
3478
|
SearchKey.IP_RANGE_TO,
|
|
3621
|
-
SearchKey.IPV6_RANGE_FROM,
|
|
3622
|
-
SearchKey.IPV6_RANGE_TO,
|
|
3623
3479
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3624
3480
|
SearchKey.MSISDN_RANGE_TO,
|
|
3625
3481
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3709,7 +3565,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3709
3565
|
print(msg)
|
|
3710
3566
|
self.logger.warning(msg)
|
|
3711
3567
|
self.warning_counter.increment()
|
|
3712
|
-
# TODO maybe raise ValidationError
|
|
3713
3568
|
|
|
3714
3569
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3715
3570
|
|