upgini 1.1.314a3511.dev3__py3-none-any.whl → 1.1.315a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +33 -386
- upgini/features_enricher.py +295 -145
- upgini/metadata.py +16 -1
- upgini/normalizer/normalize_utils.py +203 -0
- upgini/utils/country_utils.py +16 -0
- upgini/utils/datetime_utils.py +38 -16
- upgini/utils/email_utils.py +49 -17
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +345 -0
- upgini/utils/postal_code_utils.py +34 -0
- {upgini-1.1.314a3511.dev3.dist-info → upgini-1.1.315a1.dist-info}/METADATA +1 -1
- {upgini-1.1.314a3511.dev3.dist-info → upgini-1.1.315a1.dist-info}/RECORD +15 -15
- {upgini-1.1.314a3511.dev3.dist-info → upgini-1.1.315a1.dist-info}/WHEEL +1 -1
- upgini/normalizer/phone_normalizer.py +0 -340
- {upgini-1.1.314a3511.dev3.dist-info → upgini-1.1.315a1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -61,11 +61,15 @@ from upgini.metadata import (
|
|
|
61
61
|
SearchKey,
|
|
62
62
|
)
|
|
63
63
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
64
|
+
from upgini.normalizer.normalize_utils import Normalizer
|
|
64
65
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
65
66
|
from upgini.search_task import SearchTask
|
|
66
67
|
from upgini.spinner import Spinner
|
|
67
68
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
68
|
-
from upgini.utils.country_utils import
|
|
69
|
+
from upgini.utils.country_utils import (
|
|
70
|
+
CountrySearchKeyConverter,
|
|
71
|
+
CountrySearchKeyDetector,
|
|
72
|
+
)
|
|
69
73
|
from upgini.utils.custom_loss_utils import (
|
|
70
74
|
get_additional_params_custom_loss,
|
|
71
75
|
get_runtime_params_custom_loss,
|
|
@@ -87,11 +91,19 @@ from upgini.utils.display_utils import (
|
|
|
87
91
|
prepare_and_show_report,
|
|
88
92
|
show_request_quote_button,
|
|
89
93
|
)
|
|
90
|
-
from upgini.utils.email_utils import
|
|
94
|
+
from upgini.utils.email_utils import (
|
|
95
|
+
EmailDomainGenerator,
|
|
96
|
+
EmailSearchKeyConverter,
|
|
97
|
+
EmailSearchKeyDetector,
|
|
98
|
+
)
|
|
91
99
|
from upgini.utils.features_validator import FeaturesValidator
|
|
92
100
|
from upgini.utils.format import Format
|
|
93
|
-
from upgini.utils.
|
|
94
|
-
from upgini.utils.
|
|
101
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
102
|
+
from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
|
|
103
|
+
from upgini.utils.postal_code_utils import (
|
|
104
|
+
PostalCodeSearchKeyConverter,
|
|
105
|
+
PostalCodeSearchKeyDetector,
|
|
106
|
+
)
|
|
95
107
|
|
|
96
108
|
try:
|
|
97
109
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -237,6 +249,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
249
|
|
|
238
250
|
self.passed_features: List[str] = []
|
|
239
251
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
252
|
+
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
240
253
|
self.country_added = False
|
|
241
254
|
self.fit_generated_features: List[str] = []
|
|
242
255
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -247,7 +260,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
247
260
|
self.eval_set: Optional[List[Tuple]] = None
|
|
248
261
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
249
262
|
self.imbalanced = False
|
|
250
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
263
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
|
|
251
264
|
|
|
252
265
|
validate_version(self.logger)
|
|
253
266
|
self.search_keys = search_keys or {}
|
|
@@ -706,7 +719,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
706
719
|
|
|
707
720
|
start_time = time.time()
|
|
708
721
|
try:
|
|
709
|
-
result = self.__inner_transform(
|
|
722
|
+
result, _ = self.__inner_transform(
|
|
710
723
|
trace_id,
|
|
711
724
|
X,
|
|
712
725
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -906,8 +919,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
906
919
|
search_keys,
|
|
907
920
|
groups,
|
|
908
921
|
_cv,
|
|
922
|
+
columns_renaming,
|
|
909
923
|
) = prepared_data
|
|
910
924
|
|
|
925
|
+
# rename cat_features
|
|
926
|
+
if cat_features:
|
|
927
|
+
for new_c, old_c in columns_renaming.items():
|
|
928
|
+
if old_c in cat_features:
|
|
929
|
+
cat_features.remove(old_c)
|
|
930
|
+
cat_features.append(new_c)
|
|
931
|
+
|
|
911
932
|
gc.collect()
|
|
912
933
|
|
|
913
934
|
print(self.bundle.get("metrics_start"))
|
|
@@ -920,7 +941,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
920
941
|
|
|
921
942
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
922
943
|
|
|
923
|
-
has_date =
|
|
944
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
924
945
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
925
946
|
|
|
926
947
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1013,7 +1034,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1013
1034
|
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1014
1035
|
}
|
|
1015
1036
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1016
|
-
|
|
1037
|
+
effective_y
|
|
1017
1038
|
):
|
|
1018
1039
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1019
1040
|
np.mean(effective_y), 4
|
|
@@ -1086,7 +1107,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1086
1107
|
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1087
1108
|
}
|
|
1088
1109
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1089
|
-
|
|
1110
|
+
effective_eval_set[idx][1]
|
|
1090
1111
|
):
|
|
1091
1112
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1092
1113
|
np.mean(effective_eval_set[idx][1]), 4
|
|
@@ -1113,7 +1134,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1113
1134
|
)
|
|
1114
1135
|
|
|
1115
1136
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1116
|
-
date_column =
|
|
1137
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1117
1138
|
if (
|
|
1118
1139
|
uplift_col in metrics_df.columns
|
|
1119
1140
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1195,27 +1216,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1195
1216
|
def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
|
|
1196
1217
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
|
1197
1218
|
|
|
1198
|
-
def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
|
|
1199
|
-
search_keys = self.search_keys.copy()
|
|
1200
|
-
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1201
|
-
|
|
1202
|
-
extended_X = x.copy()
|
|
1203
|
-
generated_features = []
|
|
1204
|
-
date_column = self._get_date_column(search_keys)
|
|
1205
|
-
if date_column is not None:
|
|
1206
|
-
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1207
|
-
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1208
|
-
generated_features.extend(converter.generated_features)
|
|
1209
|
-
email_column = self._get_email_column(search_keys)
|
|
1210
|
-
hem_column = self._get_hem_column(search_keys)
|
|
1211
|
-
if email_column:
|
|
1212
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1213
|
-
extended_X = converter.convert(extended_X)
|
|
1214
|
-
generated_features.extend(converter.generated_features)
|
|
1215
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1216
|
-
|
|
1217
|
-
return extended_X, search_keys
|
|
1218
|
-
|
|
1219
1219
|
def _is_input_same_as_fit(
|
|
1220
1220
|
self,
|
|
1221
1221
|
X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
|
@@ -1259,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1259
1259
|
groups = None
|
|
1260
1260
|
|
|
1261
1261
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1262
|
-
date_column =
|
|
1262
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1263
1263
|
date_series = X[date_column] if date_column is not None else None
|
|
1264
1264
|
_cv, groups = CVConfig(
|
|
1265
1265
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1282,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1282
1282
|
|
|
1283
1283
|
def _get_client_cat_features(
|
|
1284
1284
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1285
|
-
) -> Optional[List[str]]:
|
|
1285
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
|
1286
1286
|
cat_features = None
|
|
1287
1287
|
search_keys_for_metrics = []
|
|
1288
1288
|
if (
|
|
@@ -1342,11 +1342,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1342
1342
|
progress_bar,
|
|
1343
1343
|
progress_callback,
|
|
1344
1344
|
)
|
|
1345
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(
|
|
1345
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
|
|
1346
|
+
sampled_data
|
|
1347
|
+
)
|
|
1346
1348
|
|
|
1347
1349
|
excluding_search_keys = list(search_keys.keys())
|
|
1348
1350
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1349
|
-
|
|
1351
|
+
for sk in excluding_search_keys:
|
|
1352
|
+
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1353
|
+
excluding_search_keys.remove(sk)
|
|
1350
1354
|
|
|
1351
1355
|
client_features = [
|
|
1352
1356
|
c
|
|
@@ -1363,6 +1367,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1363
1367
|
importance_threshold,
|
|
1364
1368
|
max_features,
|
|
1365
1369
|
)
|
|
1370
|
+
filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
|
|
1366
1371
|
|
|
1367
1372
|
X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
|
|
1368
1373
|
enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
|
|
@@ -1392,6 +1397,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1392
1397
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1393
1398
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1394
1399
|
|
|
1400
|
+
# TODO maybe there is no more need for these convertions
|
|
1395
1401
|
# Remove datetime features
|
|
1396
1402
|
datetime_features = [
|
|
1397
1403
|
f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
|
|
@@ -1479,6 +1485,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1479
1485
|
search_keys,
|
|
1480
1486
|
groups,
|
|
1481
1487
|
cv,
|
|
1488
|
+
columns_renaming,
|
|
1482
1489
|
)
|
|
1483
1490
|
|
|
1484
1491
|
@dataclass
|
|
@@ -1488,6 +1495,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1488
1495
|
enriched_X: pd.DataFrame
|
|
1489
1496
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1490
1497
|
search_keys: Dict[str, SearchKey]
|
|
1498
|
+
columns_renaming: Dict[str, str]
|
|
1491
1499
|
|
|
1492
1500
|
def _sample_data_for_metrics(
|
|
1493
1501
|
self,
|
|
@@ -1527,11 +1535,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1527
1535
|
)
|
|
1528
1536
|
|
|
1529
1537
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1530
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys =
|
|
1538
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
|
1539
|
+
self.__cached_sampled_datasets
|
|
1540
|
+
)
|
|
1531
1541
|
if exclude_features_sources:
|
|
1532
1542
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1533
1543
|
|
|
1534
|
-
return self.__mk_sampled_data_tuple(
|
|
1544
|
+
return self.__mk_sampled_data_tuple(
|
|
1545
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1546
|
+
)
|
|
1535
1547
|
|
|
1536
1548
|
def __sample_only_input(
|
|
1537
1549
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1549,6 +1561,28 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1549
1561
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1550
1562
|
df = pd.concat([df, eval_xy])
|
|
1551
1563
|
|
|
1564
|
+
search_keys = self.search_keys.copy()
|
|
1565
|
+
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1566
|
+
|
|
1567
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1568
|
+
generated_features = []
|
|
1569
|
+
if date_column is not None:
|
|
1570
|
+
converter = DateTimeSearchKeyConverter(
|
|
1571
|
+
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1572
|
+
)
|
|
1573
|
+
df = converter.convert(df, keep_time=True)
|
|
1574
|
+
generated_features = converter.generated_features
|
|
1575
|
+
|
|
1576
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
1577
|
+
if email_columns:
|
|
1578
|
+
generator = EmailDomainGenerator(email_columns)
|
|
1579
|
+
df = generator.generate(df)
|
|
1580
|
+
generated_features.extend(generator.generated_features)
|
|
1581
|
+
|
|
1582
|
+
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1583
|
+
df = normalizer.normalize(df)
|
|
1584
|
+
columns_renaming = normalizer.columns_renaming
|
|
1585
|
+
|
|
1552
1586
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1553
1587
|
|
|
1554
1588
|
num_samples = _num_samples(df)
|
|
@@ -1561,24 +1595,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1561
1595
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1562
1596
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1563
1597
|
|
|
1564
|
-
|
|
1565
|
-
|
|
1598
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1599
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1600
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1566
1601
|
|
|
1567
|
-
train_df =
|
|
1602
|
+
train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
|
|
1568
1603
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1569
1604
|
y_sampled = train_df[TARGET].copy()
|
|
1570
1605
|
enriched_X = X_sampled
|
|
1571
1606
|
|
|
1572
1607
|
if eval_set is not None:
|
|
1573
1608
|
for idx in range(len(eval_set)):
|
|
1574
|
-
eval_xy_sampled =
|
|
1609
|
+
eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1575
1610
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1576
1611
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1577
1612
|
enriched_eval_X = eval_X_sampled
|
|
1578
1613
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1579
|
-
self.__cached_sampled_datasets = (
|
|
1614
|
+
self.__cached_sampled_datasets = (
|
|
1615
|
+
X_sampled,
|
|
1616
|
+
y_sampled,
|
|
1617
|
+
enriched_X,
|
|
1618
|
+
eval_set_sampled_dict,
|
|
1619
|
+
search_keys,
|
|
1620
|
+
columns_renaming,
|
|
1621
|
+
)
|
|
1580
1622
|
|
|
1581
|
-
return self.__mk_sampled_data_tuple(
|
|
1623
|
+
return self.__mk_sampled_data_tuple(
|
|
1624
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1625
|
+
)
|
|
1582
1626
|
|
|
1583
1627
|
def __sample_balanced(
|
|
1584
1628
|
self,
|
|
@@ -1590,7 +1634,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1590
1634
|
search_keys = self.fit_search_keys
|
|
1591
1635
|
|
|
1592
1636
|
rows_to_drop = None
|
|
1593
|
-
has_date =
|
|
1637
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1594
1638
|
task_type = self.model_task_type or define_task(
|
|
1595
1639
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1596
1640
|
)
|
|
@@ -1644,9 +1688,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1644
1688
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1645
1689
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1646
1690
|
|
|
1647
|
-
self.__cached_sampled_datasets = (
|
|
1691
|
+
self.__cached_sampled_datasets = (
|
|
1692
|
+
X_sampled,
|
|
1693
|
+
y_sampled,
|
|
1694
|
+
enriched_X,
|
|
1695
|
+
eval_set_sampled_dict,
|
|
1696
|
+
search_keys,
|
|
1697
|
+
self.fit_columns_renaming,
|
|
1698
|
+
)
|
|
1648
1699
|
|
|
1649
|
-
return self.__mk_sampled_data_tuple(
|
|
1700
|
+
return self.__mk_sampled_data_tuple(
|
|
1701
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1702
|
+
)
|
|
1650
1703
|
|
|
1651
1704
|
def __sample_imbalanced(
|
|
1652
1705
|
self,
|
|
@@ -1686,7 +1739,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1686
1739
|
tmp_target_name = "__target"
|
|
1687
1740
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1688
1741
|
|
|
1689
|
-
enriched_df = self.__inner_transform(
|
|
1742
|
+
enriched_df, columns_renaming = self.__inner_transform(
|
|
1690
1743
|
trace_id,
|
|
1691
1744
|
df,
|
|
1692
1745
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1734,7 +1787,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1734
1787
|
tmp_target_name = "__target"
|
|
1735
1788
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1736
1789
|
|
|
1737
|
-
enriched_Xy = self.__inner_transform(
|
|
1790
|
+
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1738
1791
|
trace_id,
|
|
1739
1792
|
df,
|
|
1740
1793
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1759,9 +1812,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1759
1812
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1760
1813
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1761
1814
|
|
|
1762
|
-
self.__cached_sampled_datasets = (
|
|
1815
|
+
self.__cached_sampled_datasets = (
|
|
1816
|
+
X_sampled,
|
|
1817
|
+
y_sampled,
|
|
1818
|
+
enriched_X,
|
|
1819
|
+
eval_set_sampled_dict,
|
|
1820
|
+
self.search_keys,
|
|
1821
|
+
columns_renaming,
|
|
1822
|
+
)
|
|
1763
1823
|
|
|
1764
|
-
return self.__mk_sampled_data_tuple(
|
|
1824
|
+
return self.__mk_sampled_data_tuple(
|
|
1825
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1826
|
+
)
|
|
1765
1827
|
|
|
1766
1828
|
def __mk_sampled_data_tuple(
|
|
1767
1829
|
self,
|
|
@@ -1770,6 +1832,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1770
1832
|
enriched_X: pd.DataFrame,
|
|
1771
1833
|
eval_set_sampled_dict: Dict,
|
|
1772
1834
|
search_keys: Dict,
|
|
1835
|
+
columns_renaming: Dict[str, str],
|
|
1773
1836
|
):
|
|
1774
1837
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1775
1838
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1778,6 +1841,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1778
1841
|
enriched_X=enriched_X,
|
|
1779
1842
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1780
1843
|
search_keys=search_keys,
|
|
1844
|
+
columns_renaming=columns_renaming,
|
|
1781
1845
|
)
|
|
1782
1846
|
|
|
1783
1847
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1866,7 +1930,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1866
1930
|
progress_bar: Optional[ProgressBar] = None,
|
|
1867
1931
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1868
1932
|
add_fit_system_record_id: bool = False,
|
|
1869
|
-
) -> pd.DataFrame:
|
|
1933
|
+
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
1870
1934
|
if self._search_task is None:
|
|
1871
1935
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1872
1936
|
|
|
@@ -1879,13 +1943,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1943
|
|
|
1880
1944
|
if len(self.feature_names_) == 0:
|
|
1881
1945
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1882
|
-
return X
|
|
1946
|
+
return X, {c: c for c in X.columns}
|
|
1883
1947
|
|
|
1884
1948
|
if self._has_paid_features(exclude_features_sources):
|
|
1885
1949
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1886
1950
|
self.logger.warning(msg)
|
|
1887
1951
|
self.__display_support_link(msg)
|
|
1888
|
-
return None
|
|
1952
|
+
return None, {c: c for c in X.columns}
|
|
1889
1953
|
|
|
1890
1954
|
if not metrics_calculation:
|
|
1891
1955
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1896,7 +1960,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1960
|
self.logger.warning(msg)
|
|
1897
1961
|
print(msg)
|
|
1898
1962
|
show_request_quote_button()
|
|
1899
|
-
return None
|
|
1963
|
+
return None, {c: c for c in X.columns}
|
|
1900
1964
|
else:
|
|
1901
1965
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1902
1966
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -1934,9 +1998,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1934
1998
|
df = self.__add_country_code(df, search_keys)
|
|
1935
1999
|
|
|
1936
2000
|
generated_features = []
|
|
1937
|
-
date_column =
|
|
2001
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1938
2002
|
if date_column is not None:
|
|
1939
|
-
converter = DateTimeSearchKeyConverter(
|
|
2003
|
+
converter = DateTimeSearchKeyConverter(
|
|
2004
|
+
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2005
|
+
)
|
|
1940
2006
|
df = converter.convert(df)
|
|
1941
2007
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
1942
2008
|
generated_features.extend(converter.generated_features)
|
|
@@ -1945,61 +2011,93 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1945
2011
|
if self.add_date_if_missing:
|
|
1946
2012
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1947
2013
|
|
|
2014
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2015
|
+
if email_columns:
|
|
2016
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2017
|
+
df = generator.generate(df)
|
|
2018
|
+
generated_features.extend(generator.generated_features)
|
|
2019
|
+
|
|
2020
|
+
normalizer = Normalizer(
|
|
2021
|
+
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2022
|
+
)
|
|
2023
|
+
df = normalizer.normalize(df)
|
|
2024
|
+
columns_renaming = normalizer.columns_renaming
|
|
2025
|
+
|
|
1948
2026
|
# Don't pass all features in backend on transform
|
|
1949
|
-
original_features_for_transform = []
|
|
1950
2027
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1951
|
-
|
|
1952
|
-
if len(
|
|
1953
|
-
|
|
1954
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
-
original_features_for_transform = [
|
|
1958
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
-
]
|
|
1960
|
-
|
|
1961
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2028
|
+
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2029
|
+
if len(features_for_transform) > 0:
|
|
2030
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1962
2031
|
|
|
1963
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) +
|
|
2032
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
1964
2033
|
|
|
1965
2034
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1966
2035
|
df[columns_for_system_record_id], index=False
|
|
1967
2036
|
).astype("Float64")
|
|
1968
2037
|
|
|
1969
2038
|
# Explode multiple search keys
|
|
1970
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2039
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
|
1971
2040
|
|
|
1972
2041
|
email_column = self._get_email_column(search_keys)
|
|
1973
2042
|
hem_column = self._get_hem_column(search_keys)
|
|
1974
|
-
email_converted_to_hem = False
|
|
1975
2043
|
if email_column:
|
|
1976
2044
|
converter = EmailSearchKeyConverter(
|
|
1977
|
-
email_column,
|
|
2045
|
+
email_column,
|
|
2046
|
+
hem_column,
|
|
2047
|
+
search_keys,
|
|
2048
|
+
columns_renaming,
|
|
2049
|
+
list(unnest_search_keys.keys()),
|
|
2050
|
+
self.logger,
|
|
1978
2051
|
)
|
|
1979
2052
|
df = converter.convert(df)
|
|
1980
|
-
|
|
1981
|
-
|
|
2053
|
+
|
|
2054
|
+
ip_column = self._get_ip_column(search_keys)
|
|
2055
|
+
if ip_column:
|
|
2056
|
+
converter = IpSearchKeyConverter(
|
|
2057
|
+
ip_column,
|
|
2058
|
+
search_keys,
|
|
2059
|
+
columns_renaming,
|
|
2060
|
+
list(unnest_search_keys.keys()),
|
|
2061
|
+
self.bundle,
|
|
2062
|
+
self.logger,
|
|
2063
|
+
)
|
|
2064
|
+
df = converter.convert(df)
|
|
2065
|
+
|
|
2066
|
+
phone_column = self._get_phone_column(search_keys)
|
|
2067
|
+
country_column = self._get_country_column(search_keys)
|
|
2068
|
+
if phone_column:
|
|
2069
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2070
|
+
df = converter.convert(df)
|
|
2071
|
+
|
|
2072
|
+
if country_column:
|
|
2073
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2074
|
+
df = converter.convert(df)
|
|
2075
|
+
|
|
2076
|
+
postal_code = self._get_postal_column(search_keys)
|
|
2077
|
+
if postal_code:
|
|
2078
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2079
|
+
df = converter.convert(df)
|
|
2080
|
+
|
|
1982
2081
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1983
2082
|
|
|
1984
2083
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1985
|
-
|
|
1986
|
-
for col in original_features_for_transform:
|
|
2084
|
+
for col in features_for_transform:
|
|
1987
2085
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1988
|
-
features_not_to_pass = [
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2086
|
+
features_not_to_pass = [
|
|
2087
|
+
c
|
|
2088
|
+
for c in df.columns
|
|
2089
|
+
if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
|
|
2090
|
+
]
|
|
1995
2091
|
|
|
1996
2092
|
if add_fit_system_record_id:
|
|
1997
|
-
df = self.__add_fit_system_record_id(df,
|
|
2093
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2094
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2095
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1998
2096
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1999
2097
|
features_not_to_pass.append(SORT_ID)
|
|
2000
2098
|
|
|
2001
|
-
|
|
2002
|
-
|
|
2099
|
+
# search keys might be changed after explode
|
|
2100
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2003
2101
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2004
2102
|
"Float64"
|
|
2005
2103
|
)
|
|
@@ -2035,8 +2133,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2035
2133
|
rest_client=self.rest_client,
|
|
2036
2134
|
logger=self.logger,
|
|
2037
2135
|
)
|
|
2038
|
-
|
|
2039
|
-
dataset.ignore_columns = [email_column]
|
|
2136
|
+
dataset.columns_renaming = columns_renaming
|
|
2040
2137
|
|
|
2041
2138
|
if max_features is not None or importance_threshold is not None:
|
|
2042
2139
|
exclude_features_sources = list(
|
|
@@ -2125,7 +2222,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2125
2222
|
result = enrich()
|
|
2126
2223
|
|
|
2127
2224
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2128
|
-
existing_filtered_columns = [
|
|
2225
|
+
existing_filtered_columns = [
|
|
2226
|
+
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2227
|
+
]
|
|
2129
2228
|
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2130
2229
|
if add_fit_system_record_id:
|
|
2131
2230
|
selecting_columns.append(SORT_ID)
|
|
@@ -2138,7 +2237,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2138
2237
|
if add_fit_system_record_id:
|
|
2139
2238
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2140
2239
|
|
|
2141
|
-
return result
|
|
2240
|
+
return result, columns_renaming
|
|
2142
2241
|
|
|
2143
2242
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2144
2243
|
features_info = self._internal_features_info
|
|
@@ -2239,6 +2338,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2239
2338
|
self.df_with_original_index = None
|
|
2240
2339
|
self.__cached_sampled_datasets = None
|
|
2241
2340
|
self.metrics = None
|
|
2341
|
+
self.fit_columns_renaming = None
|
|
2342
|
+
self.fit_dropped_features = set()
|
|
2343
|
+
self.fit_generated_features = []
|
|
2242
2344
|
|
|
2243
2345
|
validated_X = self._validate_X(X)
|
|
2244
2346
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2285,9 +2387,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2285
2387
|
self.fit_search_keys = self.search_keys.copy()
|
|
2286
2388
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2287
2389
|
|
|
2288
|
-
maybe_date_column =
|
|
2390
|
+
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2289
2391
|
has_date = maybe_date_column is not None
|
|
2290
2392
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2393
|
+
|
|
2291
2394
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2292
2395
|
|
|
2293
2396
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2317,7 +2420,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2317
2420
|
self.fit_generated_features = []
|
|
2318
2421
|
|
|
2319
2422
|
if has_date:
|
|
2320
|
-
converter = DateTimeSearchKeyConverter(
|
|
2423
|
+
converter = DateTimeSearchKeyConverter(
|
|
2424
|
+
maybe_date_column,
|
|
2425
|
+
self.date_format,
|
|
2426
|
+
self.logger,
|
|
2427
|
+
bundle=self.bundle,
|
|
2428
|
+
warnings_counter=self.warning_counter,
|
|
2429
|
+
)
|
|
2321
2430
|
df = converter.convert(df, keep_time=True)
|
|
2322
2431
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2323
2432
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2326,6 +2435,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2326
2435
|
if self.add_date_if_missing:
|
|
2327
2436
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2328
2437
|
|
|
2438
|
+
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
2439
|
+
if email_columns:
|
|
2440
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2441
|
+
df = generator.generate(df)
|
|
2442
|
+
self.fit_generated_features.extend(generator.generated_features)
|
|
2443
|
+
|
|
2329
2444
|
# Checks that need validated date
|
|
2330
2445
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2331
2446
|
|
|
@@ -2334,7 +2449,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2334
2449
|
|
|
2335
2450
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2336
2451
|
|
|
2337
|
-
|
|
2452
|
+
normalizer = Normalizer(
|
|
2453
|
+
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2454
|
+
)
|
|
2455
|
+
df = normalizer.normalize(df)
|
|
2456
|
+
columns_renaming = normalizer.columns_renaming
|
|
2457
|
+
self.fit_columns_renaming = columns_renaming
|
|
2338
2458
|
|
|
2339
2459
|
df = remove_fintech_duplicates(
|
|
2340
2460
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2342,38 +2462,58 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2342
2462
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2343
2463
|
|
|
2344
2464
|
# Explode multiple search keys
|
|
2345
|
-
|
|
2346
|
-
meaning_types = {
|
|
2347
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
-
}
|
|
2350
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2465
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2354
2466
|
|
|
2355
2467
|
# TODO check that this is correct for enrichment
|
|
2356
2468
|
self.df_with_original_index = df.copy()
|
|
2469
|
+
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2357
2470
|
|
|
2358
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2471
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
|
|
2359
2472
|
|
|
2360
2473
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2361
2474
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2362
2475
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
-
email_converted_to_hem = False
|
|
2364
2476
|
if email_column:
|
|
2365
2477
|
converter = EmailSearchKeyConverter(
|
|
2366
|
-
email_column,
|
|
2478
|
+
email_column,
|
|
2479
|
+
hem_column,
|
|
2480
|
+
self.fit_search_keys,
|
|
2481
|
+
columns_renaming,
|
|
2482
|
+
list(unnest_search_keys.keys()),
|
|
2483
|
+
self.logger,
|
|
2367
2484
|
)
|
|
2368
2485
|
df = converter.convert(df)
|
|
2369
|
-
|
|
2370
|
-
|
|
2486
|
+
|
|
2487
|
+
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2488
|
+
if ip_column:
|
|
2489
|
+
converter = IpSearchKeyConverter(
|
|
2490
|
+
ip_column,
|
|
2491
|
+
self.fit_search_keys,
|
|
2492
|
+
columns_renaming,
|
|
2493
|
+
list(unnest_search_keys.keys()),
|
|
2494
|
+
self.bundle,
|
|
2495
|
+
self.logger,
|
|
2496
|
+
)
|
|
2497
|
+
df = converter.convert(df)
|
|
2498
|
+
|
|
2499
|
+
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2500
|
+
country_column = self._get_country_column(self.fit_search_keys)
|
|
2501
|
+
if phone_column:
|
|
2502
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2503
|
+
df = converter.convert(df)
|
|
2504
|
+
|
|
2505
|
+
if country_column:
|
|
2506
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2507
|
+
df = converter.convert(df)
|
|
2508
|
+
|
|
2509
|
+
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2510
|
+
if postal_code:
|
|
2511
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2512
|
+
df = converter.convert(df)
|
|
2371
2513
|
|
|
2372
2514
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2373
2515
|
self.fit_search_keys.keys()
|
|
2374
2516
|
)
|
|
2375
|
-
if email_converted_to_hem:
|
|
2376
|
-
non_feature_columns.append(email_column)
|
|
2377
2517
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2378
2518
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2379
2519
|
|
|
@@ -2385,9 +2525,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2385
2525
|
self.fit_dropped_features.update(features_to_drop)
|
|
2386
2526
|
df = df.drop(columns=features_to_drop)
|
|
2387
2527
|
|
|
2388
|
-
if email_converted_to_hem:
|
|
2389
|
-
self.fit_dropped_features.add(email_column)
|
|
2390
|
-
|
|
2391
2528
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2392
2529
|
|
|
2393
2530
|
meaning_types = {
|
|
@@ -2401,7 +2538,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2401
2538
|
if eval_set is not None and len(eval_set) > 0:
|
|
2402
2539
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2403
2540
|
|
|
2404
|
-
df = self.__add_fit_system_record_id(df,
|
|
2541
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2542
|
+
|
|
2543
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2544
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2545
|
+
|
|
2546
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2405
2547
|
|
|
2406
2548
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2407
2549
|
|
|
@@ -2419,8 +2561,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2419
2561
|
rest_client=self.rest_client,
|
|
2420
2562
|
logger=self.logger,
|
|
2421
2563
|
)
|
|
2422
|
-
|
|
2423
|
-
dataset.ignore_columns = [email_column]
|
|
2564
|
+
dataset.columns_renaming = columns_renaming
|
|
2424
2565
|
|
|
2425
2566
|
self.passed_features = [
|
|
2426
2567
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2809,7 +2950,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2809
2950
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2810
2951
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2811
2952
|
else:
|
|
2812
|
-
date_column =
|
|
2953
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2813
2954
|
sort_columns = [date_column] if date_column is not None else []
|
|
2814
2955
|
|
|
2815
2956
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -2905,10 +3046,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2905
3046
|
|
|
2906
3047
|
do_without_pandas_limits(print_datasets_sample)
|
|
2907
3048
|
|
|
2908
|
-
maybe_date_col =
|
|
3049
|
+
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2909
3050
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2910
3051
|
# TODO cast date column to single dtype
|
|
2911
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3052
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
|
|
2912
3053
|
converted_X = date_converter.convert(X)
|
|
2913
3054
|
min_date = converted_X[maybe_date_col].min()
|
|
2914
3055
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -2935,12 +3076,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2935
3076
|
|
|
2936
3077
|
return df
|
|
2937
3078
|
|
|
2938
|
-
@staticmethod
|
|
2939
|
-
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
-
for col, t in search_keys.items():
|
|
2941
|
-
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
-
return col
|
|
2943
|
-
|
|
2944
3079
|
@staticmethod
|
|
2945
3080
|
def _add_current_date_as_key(
|
|
2946
3081
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -2956,7 +3091,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2956
3091
|
logger.warning(msg)
|
|
2957
3092
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2958
3093
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2959
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
3094
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
|
|
2960
3095
|
df = converter.convert(df)
|
|
2961
3096
|
return df
|
|
2962
3097
|
|
|
@@ -2984,17 +3119,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2984
3119
|
if len(cols) == 1:
|
|
2985
3120
|
return cols[0]
|
|
2986
3121
|
|
|
3122
|
+
@staticmethod
|
|
3123
|
+
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3124
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3125
|
+
if len(cols) > 1:
|
|
3126
|
+
raise Exception("More than one ip column found after unnest")
|
|
3127
|
+
if len(cols) == 1:
|
|
3128
|
+
return cols[0]
|
|
3129
|
+
|
|
2987
3130
|
@staticmethod
|
|
2988
3131
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2989
3132
|
for col, t in search_keys.items():
|
|
2990
3133
|
if t == SearchKey.PHONE:
|
|
2991
3134
|
return col
|
|
2992
3135
|
|
|
3136
|
+
@staticmethod
|
|
3137
|
+
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3138
|
+
for col, t in search_keys.items():
|
|
3139
|
+
if t == SearchKey.COUNTRY:
|
|
3140
|
+
return col
|
|
3141
|
+
|
|
3142
|
+
@staticmethod
|
|
3143
|
+
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3144
|
+
for col, t in search_keys.items():
|
|
3145
|
+
if t == SearchKey.POSTAL_CODE:
|
|
3146
|
+
return col
|
|
3147
|
+
|
|
2993
3148
|
def _explode_multiple_search_keys(
|
|
2994
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3149
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
2995
3150
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2996
3151
|
# find groups of multiple search keys
|
|
2997
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3152
|
+
search_key_names_by_type: Dict[SearchKey, List[str]] = {}
|
|
2998
3153
|
for key_name, key_type in search_keys.items():
|
|
2999
3154
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3000
3155
|
search_key_names_by_type = {
|
|
@@ -3018,6 +3173,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3018
3173
|
del search_keys[old_key]
|
|
3019
3174
|
search_keys[new_search_key] = key_type
|
|
3020
3175
|
unnest_search_keys[new_search_key] = key_names
|
|
3176
|
+
columns_renaming[new_search_key] = new_search_key
|
|
3021
3177
|
|
|
3022
3178
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3023
3179
|
return df, unnest_search_keys
|
|
@@ -3025,7 +3181,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3025
3181
|
def __add_fit_system_record_id(
|
|
3026
3182
|
self,
|
|
3027
3183
|
df: pd.DataFrame,
|
|
3028
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3184
|
+
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3029
3185
|
search_keys: Dict[str, SearchKey],
|
|
3030
3186
|
id_name: str,
|
|
3031
3187
|
) -> pd.DataFrame:
|
|
@@ -3048,9 +3204,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3048
3204
|
]
|
|
3049
3205
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3050
3206
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3051
|
-
sort_exclude_columns.append(
|
|
3207
|
+
sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
|
|
3052
3208
|
else:
|
|
3053
|
-
date_column =
|
|
3209
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3054
3210
|
sort_columns = [date_column] if date_column is not None else []
|
|
3055
3211
|
|
|
3056
3212
|
other_columns = sorted(
|
|
@@ -3059,13 +3215,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3059
3215
|
for c in df.columns
|
|
3060
3216
|
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
3061
3217
|
]
|
|
3062
|
-
# [
|
|
3063
|
-
# sk
|
|
3064
|
-
# for sk, key_type in search_keys.items()
|
|
3065
|
-
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
-
# and sk in df.columns
|
|
3067
|
-
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
-
# ]
|
|
3069
3218
|
)
|
|
3070
3219
|
|
|
3071
3220
|
search_keys_hash = "search_keys_hash"
|
|
@@ -3078,9 +3227,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3078
3227
|
if search_keys_hash in df.columns:
|
|
3079
3228
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3080
3229
|
|
|
3081
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
-
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
-
|
|
3084
3230
|
df = df.reset_index(drop=True).reset_index()
|
|
3085
3231
|
# system_record_id saves correct order for fit
|
|
3086
3232
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3090,11 +3236,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3090
3236
|
df.index.name = original_index_name
|
|
3091
3237
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3092
3238
|
|
|
3093
|
-
meaning_types[id_name] = (
|
|
3094
|
-
|
|
3095
|
-
|
|
3096
|
-
|
|
3097
|
-
)
|
|
3239
|
+
# meaning_types[id_name] = (
|
|
3240
|
+
# FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3241
|
+
# if id_name == SYSTEM_RECORD_ID
|
|
3242
|
+
# else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3243
|
+
# )
|
|
3098
3244
|
return df
|
|
3099
3245
|
|
|
3100
3246
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3472,10 +3618,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3472
3618
|
for _, key_type in search_keys.items():
|
|
3473
3619
|
if not isinstance(key_type, SearchKey):
|
|
3474
3620
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3621
|
+
|
|
3475
3622
|
valid_search_keys = {}
|
|
3476
3623
|
unsupported_search_keys = {
|
|
3477
3624
|
SearchKey.IP_RANGE_FROM,
|
|
3478
3625
|
SearchKey.IP_RANGE_TO,
|
|
3626
|
+
SearchKey.IPV6_RANGE_FROM,
|
|
3627
|
+
SearchKey.IPV6_RANGE_TO,
|
|
3479
3628
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3480
3629
|
SearchKey.MSISDN_RANGE_TO,
|
|
3481
3630
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3565,6 +3714,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3565
3714
|
print(msg)
|
|
3566
3715
|
self.logger.warning(msg)
|
|
3567
3716
|
self.warning_counter.increment()
|
|
3717
|
+
# TODO maybe raise ValidationError
|
|
3568
3718
|
|
|
3569
3719
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3570
3720
|
|