upgini 1.1.312a5__py3-none-any.whl → 1.1.313__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +26 -7
- upgini/autofe/binary.py +95 -4
- upgini/autofe/date.py +26 -6
- upgini/autofe/feature.py +25 -11
- upgini/autofe/unary.py +7 -0
- upgini/dataset.py +386 -33
- upgini/features_enricher.py +145 -295
- upgini/metadata.py +1 -16
- upgini/normalizer/phone_normalizer.py +340 -0
- upgini/utils/country_utils.py +0 -16
- upgini/utils/datetime_utils.py +16 -38
- upgini/utils/email_utils.py +17 -49
- upgini/utils/ip_utils.py +1 -100
- upgini/utils/phone_utils.py +0 -345
- upgini/utils/postal_code_utils.py +0 -34
- {upgini-1.1.312a5.dist-info → upgini-1.1.313.dist-info}/METADATA +3 -1
- {upgini-1.1.312a5.dist-info → upgini-1.1.313.dist-info}/RECORD +20 -20
- {upgini-1.1.312a5.dist-info → upgini-1.1.313.dist-info}/WHEEL +1 -1
- upgini/normalizer/normalize_utils.py +0 -203
- {upgini-1.1.312a5.dist-info → upgini-1.1.313.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -61,15 +61,11 @@ from upgini.metadata import (
|
|
|
61
61
|
SearchKey,
|
|
62
62
|
)
|
|
63
63
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
64
|
-
from upgini.normalizer.normalize_utils import Normalizer
|
|
65
64
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
66
65
|
from upgini.search_task import SearchTask
|
|
67
66
|
from upgini.spinner import Spinner
|
|
68
67
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
69
|
-
from upgini.utils.country_utils import
|
|
70
|
-
CountrySearchKeyConverter,
|
|
71
|
-
CountrySearchKeyDetector,
|
|
72
|
-
)
|
|
68
|
+
from upgini.utils.country_utils import CountrySearchKeyDetector
|
|
73
69
|
from upgini.utils.custom_loss_utils import (
|
|
74
70
|
get_additional_params_custom_loss,
|
|
75
71
|
get_runtime_params_custom_loss,
|
|
@@ -91,19 +87,11 @@ from upgini.utils.display_utils import (
|
|
|
91
87
|
prepare_and_show_report,
|
|
92
88
|
show_request_quote_button,
|
|
93
89
|
)
|
|
94
|
-
from upgini.utils.email_utils import
|
|
95
|
-
EmailDomainGenerator,
|
|
96
|
-
EmailSearchKeyConverter,
|
|
97
|
-
EmailSearchKeyDetector,
|
|
98
|
-
)
|
|
90
|
+
from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
99
91
|
from upgini.utils.features_validator import FeaturesValidator
|
|
100
92
|
from upgini.utils.format import Format
|
|
101
|
-
from upgini.utils.
|
|
102
|
-
from upgini.utils.
|
|
103
|
-
from upgini.utils.postal_code_utils import (
|
|
104
|
-
PostalCodeSearchKeyConverter,
|
|
105
|
-
PostalCodeSearchKeyDetector,
|
|
106
|
-
)
|
|
93
|
+
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
|
94
|
+
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
|
107
95
|
|
|
108
96
|
try:
|
|
109
97
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -249,7 +237,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
249
237
|
|
|
250
238
|
self.passed_features: List[str] = []
|
|
251
239
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
252
|
-
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
253
240
|
self.country_added = False
|
|
254
241
|
self.fit_generated_features: List[str] = []
|
|
255
242
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -260,7 +247,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
260
247
|
self.eval_set: Optional[List[Tuple]] = None
|
|
261
248
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
262
249
|
self.imbalanced = False
|
|
263
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict
|
|
250
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
264
251
|
|
|
265
252
|
validate_version(self.logger)
|
|
266
253
|
self.search_keys = search_keys or {}
|
|
@@ -719,7 +706,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
719
706
|
|
|
720
707
|
start_time = time.time()
|
|
721
708
|
try:
|
|
722
|
-
result
|
|
709
|
+
result = self.__inner_transform(
|
|
723
710
|
trace_id,
|
|
724
711
|
X,
|
|
725
712
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -919,16 +906,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
919
906
|
search_keys,
|
|
920
907
|
groups,
|
|
921
908
|
_cv,
|
|
922
|
-
columns_renaming,
|
|
923
909
|
) = prepared_data
|
|
924
910
|
|
|
925
|
-
# rename cat_features
|
|
926
|
-
if cat_features:
|
|
927
|
-
for new_c, old_c in columns_renaming.items():
|
|
928
|
-
if old_c in cat_features:
|
|
929
|
-
cat_features.remove(old_c)
|
|
930
|
-
cat_features.append(new_c)
|
|
931
|
-
|
|
932
911
|
gc.collect()
|
|
933
912
|
|
|
934
913
|
print(self.bundle.get("metrics_start"))
|
|
@@ -941,7 +920,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
941
920
|
|
|
942
921
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
943
922
|
|
|
944
|
-
has_date =
|
|
923
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
945
924
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
946
925
|
|
|
947
926
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1034,7 +1013,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1034
1013
|
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1035
1014
|
}
|
|
1036
1015
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1037
|
-
|
|
1016
|
+
y_sorted
|
|
1038
1017
|
):
|
|
1039
1018
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1040
1019
|
np.mean(effective_y), 4
|
|
@@ -1107,7 +1086,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1107
1086
|
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1108
1087
|
}
|
|
1109
1088
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1110
|
-
|
|
1089
|
+
eval_y_sorted
|
|
1111
1090
|
):
|
|
1112
1091
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1113
1092
|
np.mean(effective_eval_set[idx][1]), 4
|
|
@@ -1134,7 +1113,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1134
1113
|
)
|
|
1135
1114
|
|
|
1136
1115
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1137
|
-
date_column =
|
|
1116
|
+
date_column = self._get_date_column(search_keys)
|
|
1138
1117
|
if (
|
|
1139
1118
|
uplift_col in metrics_df.columns
|
|
1140
1119
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1216,6 +1195,27 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1216
1195
|
def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
|
|
1217
1196
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
|
1218
1197
|
|
|
1198
|
+
def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
|
|
1199
|
+
search_keys = self.search_keys.copy()
|
|
1200
|
+
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1201
|
+
|
|
1202
|
+
extended_X = x.copy()
|
|
1203
|
+
generated_features = []
|
|
1204
|
+
date_column = self._get_date_column(search_keys)
|
|
1205
|
+
if date_column is not None:
|
|
1206
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1207
|
+
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1208
|
+
generated_features.extend(converter.generated_features)
|
|
1209
|
+
email_column = self._get_email_column(search_keys)
|
|
1210
|
+
hem_column = self._get_hem_column(search_keys)
|
|
1211
|
+
if email_column:
|
|
1212
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1213
|
+
extended_X = converter.convert(extended_X)
|
|
1214
|
+
generated_features.extend(converter.generated_features)
|
|
1215
|
+
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1216
|
+
|
|
1217
|
+
return extended_X, search_keys
|
|
1218
|
+
|
|
1219
1219
|
def _is_input_same_as_fit(
|
|
1220
1220
|
self,
|
|
1221
1221
|
X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
|
@@ -1259,7 +1259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1259
1259
|
groups = None
|
|
1260
1260
|
|
|
1261
1261
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1262
|
-
date_column =
|
|
1262
|
+
date_column = self._get_date_column(search_keys)
|
|
1263
1263
|
date_series = X[date_column] if date_column is not None else None
|
|
1264
1264
|
_cv, groups = CVConfig(
|
|
1265
1265
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1282,7 +1282,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1282
1282
|
|
|
1283
1283
|
def _get_client_cat_features(
|
|
1284
1284
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1285
|
-
) ->
|
|
1285
|
+
) -> Optional[List[str]]:
|
|
1286
1286
|
cat_features = None
|
|
1287
1287
|
search_keys_for_metrics = []
|
|
1288
1288
|
if (
|
|
@@ -1342,15 +1342,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1342
1342
|
progress_bar,
|
|
1343
1343
|
progress_callback,
|
|
1344
1344
|
)
|
|
1345
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
|
|
1346
|
-
sampled_data
|
|
1347
|
-
)
|
|
1345
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(sampled_data)
|
|
1348
1346
|
|
|
1349
1347
|
excluding_search_keys = list(search_keys.keys())
|
|
1350
1348
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1351
|
-
for sk in excluding_search_keys
|
|
1352
|
-
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1353
|
-
excluding_search_keys.remove(sk)
|
|
1349
|
+
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
|
|
1354
1350
|
|
|
1355
1351
|
client_features = [
|
|
1356
1352
|
c
|
|
@@ -1367,7 +1363,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1367
1363
|
importance_threshold,
|
|
1368
1364
|
max_features,
|
|
1369
1365
|
)
|
|
1370
|
-
filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
|
|
1371
1366
|
|
|
1372
1367
|
X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
|
|
1373
1368
|
enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
|
|
@@ -1397,7 +1392,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1397
1392
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1398
1393
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1399
1394
|
|
|
1400
|
-
# TODO maybe there is no more need for these convertions
|
|
1401
1395
|
# Remove datetime features
|
|
1402
1396
|
datetime_features = [
|
|
1403
1397
|
f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
|
|
@@ -1485,7 +1479,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1485
1479
|
search_keys,
|
|
1486
1480
|
groups,
|
|
1487
1481
|
cv,
|
|
1488
|
-
columns_renaming,
|
|
1489
1482
|
)
|
|
1490
1483
|
|
|
1491
1484
|
@dataclass
|
|
@@ -1495,7 +1488,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1495
1488
|
enriched_X: pd.DataFrame
|
|
1496
1489
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1497
1490
|
search_keys: Dict[str, SearchKey]
|
|
1498
|
-
columns_renaming: Dict[str, str]
|
|
1499
1491
|
|
|
1500
1492
|
def _sample_data_for_metrics(
|
|
1501
1493
|
self,
|
|
@@ -1535,15 +1527,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1535
1527
|
)
|
|
1536
1528
|
|
|
1537
1529
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1538
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
|
|
1539
|
-
self.__cached_sampled_datasets
|
|
1540
|
-
)
|
|
1530
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
|
|
1541
1531
|
if exclude_features_sources:
|
|
1542
1532
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1543
1533
|
|
|
1544
|
-
return self.__mk_sampled_data_tuple(
|
|
1545
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1546
|
-
)
|
|
1534
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1547
1535
|
|
|
1548
1536
|
def __sample_only_input(
|
|
1549
1537
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1561,28 +1549,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1561
1549
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1562
1550
|
df = pd.concat([df, eval_xy])
|
|
1563
1551
|
|
|
1564
|
-
search_keys = self.search_keys.copy()
|
|
1565
|
-
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1566
|
-
|
|
1567
|
-
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1568
|
-
generated_features = []
|
|
1569
|
-
if date_column is not None:
|
|
1570
|
-
converter = DateTimeSearchKeyConverter(
|
|
1571
|
-
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1572
|
-
)
|
|
1573
|
-
df = converter.convert(df, keep_time=True)
|
|
1574
|
-
generated_features = converter.generated_features
|
|
1575
|
-
|
|
1576
|
-
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
1577
|
-
if email_columns:
|
|
1578
|
-
generator = EmailDomainGenerator(email_columns)
|
|
1579
|
-
df = generator.generate(df)
|
|
1580
|
-
generated_features.extend(generator.generated_features)
|
|
1581
|
-
|
|
1582
|
-
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1583
|
-
df = normalizer.normalize(df)
|
|
1584
|
-
columns_renaming = normalizer.columns_renaming
|
|
1585
|
-
|
|
1586
1552
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1587
1553
|
|
|
1588
1554
|
num_samples = _num_samples(df)
|
|
@@ -1595,34 +1561,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1595
1561
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1596
1562
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1597
1563
|
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1564
|
+
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1565
|
+
df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1601
1566
|
|
|
1602
|
-
train_df =
|
|
1567
|
+
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1603
1568
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1604
1569
|
y_sampled = train_df[TARGET].copy()
|
|
1605
1570
|
enriched_X = X_sampled
|
|
1606
1571
|
|
|
1607
1572
|
if eval_set is not None:
|
|
1608
1573
|
for idx in range(len(eval_set)):
|
|
1609
|
-
eval_xy_sampled =
|
|
1574
|
+
eval_xy_sampled = df_extended.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1610
1575
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1611
1576
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1612
1577
|
enriched_eval_X = eval_X_sampled
|
|
1613
1578
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1614
|
-
self.__cached_sampled_datasets = (
|
|
1615
|
-
X_sampled,
|
|
1616
|
-
y_sampled,
|
|
1617
|
-
enriched_X,
|
|
1618
|
-
eval_set_sampled_dict,
|
|
1619
|
-
search_keys,
|
|
1620
|
-
columns_renaming,
|
|
1621
|
-
)
|
|
1579
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1622
1580
|
|
|
1623
|
-
return self.__mk_sampled_data_tuple(
|
|
1624
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1625
|
-
)
|
|
1581
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1626
1582
|
|
|
1627
1583
|
def __sample_balanced(
|
|
1628
1584
|
self,
|
|
@@ -1634,7 +1590,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1634
1590
|
search_keys = self.fit_search_keys
|
|
1635
1591
|
|
|
1636
1592
|
rows_to_drop = None
|
|
1637
|
-
has_date =
|
|
1593
|
+
has_date = self._get_date_column(search_keys) is not None
|
|
1638
1594
|
task_type = self.model_task_type or define_task(
|
|
1639
1595
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1640
1596
|
)
|
|
@@ -1688,18 +1644,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1688
1644
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1689
1645
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1690
1646
|
|
|
1691
|
-
self.__cached_sampled_datasets = (
|
|
1692
|
-
X_sampled,
|
|
1693
|
-
y_sampled,
|
|
1694
|
-
enriched_X,
|
|
1695
|
-
eval_set_sampled_dict,
|
|
1696
|
-
search_keys,
|
|
1697
|
-
self.fit_columns_renaming,
|
|
1698
|
-
)
|
|
1647
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1699
1648
|
|
|
1700
|
-
return self.__mk_sampled_data_tuple(
|
|
1701
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1702
|
-
)
|
|
1649
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1703
1650
|
|
|
1704
1651
|
def __sample_imbalanced(
|
|
1705
1652
|
self,
|
|
@@ -1739,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1739
1686
|
tmp_target_name = "__target"
|
|
1740
1687
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1741
1688
|
|
|
1742
|
-
enriched_df
|
|
1689
|
+
enriched_df = self.__inner_transform(
|
|
1743
1690
|
trace_id,
|
|
1744
1691
|
df,
|
|
1745
1692
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1787,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1787
1734
|
tmp_target_name = "__target"
|
|
1788
1735
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1789
1736
|
|
|
1790
|
-
enriched_Xy
|
|
1737
|
+
enriched_Xy = self.__inner_transform(
|
|
1791
1738
|
trace_id,
|
|
1792
1739
|
df,
|
|
1793
1740
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1812,18 +1759,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1812
1759
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1813
1760
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1814
1761
|
|
|
1815
|
-
self.__cached_sampled_datasets = (
|
|
1816
|
-
X_sampled,
|
|
1817
|
-
y_sampled,
|
|
1818
|
-
enriched_X,
|
|
1819
|
-
eval_set_sampled_dict,
|
|
1820
|
-
self.search_keys,
|
|
1821
|
-
columns_renaming,
|
|
1822
|
-
)
|
|
1762
|
+
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
|
|
1823
1763
|
|
|
1824
|
-
return self.__mk_sampled_data_tuple(
|
|
1825
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1826
|
-
)
|
|
1764
|
+
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys)
|
|
1827
1765
|
|
|
1828
1766
|
def __mk_sampled_data_tuple(
|
|
1829
1767
|
self,
|
|
@@ -1832,7 +1770,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1832
1770
|
enriched_X: pd.DataFrame,
|
|
1833
1771
|
eval_set_sampled_dict: Dict,
|
|
1834
1772
|
search_keys: Dict,
|
|
1835
|
-
columns_renaming: Dict[str, str],
|
|
1836
1773
|
):
|
|
1837
1774
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1838
1775
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1841,7 +1778,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1841
1778
|
enriched_X=enriched_X,
|
|
1842
1779
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1843
1780
|
search_keys=search_keys,
|
|
1844
|
-
columns_renaming=columns_renaming,
|
|
1845
1781
|
)
|
|
1846
1782
|
|
|
1847
1783
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1930,7 +1866,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1930
1866
|
progress_bar: Optional[ProgressBar] = None,
|
|
1931
1867
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1932
1868
|
add_fit_system_record_id: bool = False,
|
|
1933
|
-
) ->
|
|
1869
|
+
) -> pd.DataFrame:
|
|
1934
1870
|
if self._search_task is None:
|
|
1935
1871
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1936
1872
|
|
|
@@ -1943,13 +1879,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1943
1879
|
|
|
1944
1880
|
if len(self.feature_names_) == 0:
|
|
1945
1881
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1946
|
-
return X
|
|
1882
|
+
return X
|
|
1947
1883
|
|
|
1948
1884
|
if self._has_paid_features(exclude_features_sources):
|
|
1949
1885
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1950
1886
|
self.logger.warning(msg)
|
|
1951
1887
|
self.__display_support_link(msg)
|
|
1952
|
-
return None
|
|
1888
|
+
return None
|
|
1953
1889
|
|
|
1954
1890
|
if not metrics_calculation:
|
|
1955
1891
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1960,7 +1896,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1960
1896
|
self.logger.warning(msg)
|
|
1961
1897
|
print(msg)
|
|
1962
1898
|
show_request_quote_button()
|
|
1963
|
-
return None
|
|
1899
|
+
return None
|
|
1964
1900
|
else:
|
|
1965
1901
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1966
1902
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -1998,11 +1934,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1998
1934
|
df = self.__add_country_code(df, search_keys)
|
|
1999
1935
|
|
|
2000
1936
|
generated_features = []
|
|
2001
|
-
date_column =
|
|
1937
|
+
date_column = self._get_date_column(search_keys)
|
|
2002
1938
|
if date_column is not None:
|
|
2003
|
-
converter = DateTimeSearchKeyConverter(
|
|
2004
|
-
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2005
|
-
)
|
|
1939
|
+
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2006
1940
|
df = converter.convert(df)
|
|
2007
1941
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2008
1942
|
generated_features.extend(converter.generated_features)
|
|
@@ -2011,93 +1945,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2011
1945
|
if self.add_date_if_missing:
|
|
2012
1946
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
2013
1947
|
|
|
2014
|
-
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2015
|
-
if email_columns:
|
|
2016
|
-
generator = EmailDomainGenerator(email_columns)
|
|
2017
|
-
df = generator.generate(df)
|
|
2018
|
-
generated_features.extend(generator.generated_features)
|
|
2019
|
-
|
|
2020
|
-
normalizer = Normalizer(
|
|
2021
|
-
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2022
|
-
)
|
|
2023
|
-
df = normalizer.normalize(df)
|
|
2024
|
-
columns_renaming = normalizer.columns_renaming
|
|
2025
|
-
|
|
2026
1948
|
# Don't pass all features in backend on transform
|
|
1949
|
+
original_features_for_transform = []
|
|
2027
1950
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
2028
|
-
|
|
2029
|
-
if len(
|
|
2030
|
-
|
|
1951
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1952
|
+
if len(features_not_to_pass) > 0:
|
|
1953
|
+
# Pass only features that need for transform
|
|
1954
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
+
original_features_for_transform = [
|
|
1958
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
+
]
|
|
1960
|
+
|
|
1961
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2031
1962
|
|
|
2032
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) +
|
|
1963
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2033
1964
|
|
|
2034
1965
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
2035
1966
|
df[columns_for_system_record_id], index=False
|
|
2036
1967
|
).astype("Float64")
|
|
2037
1968
|
|
|
2038
1969
|
# Explode multiple search keys
|
|
2039
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys
|
|
1970
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2040
1971
|
|
|
2041
1972
|
email_column = self._get_email_column(search_keys)
|
|
2042
1973
|
hem_column = self._get_hem_column(search_keys)
|
|
1974
|
+
email_converted_to_hem = False
|
|
2043
1975
|
if email_column:
|
|
2044
1976
|
converter = EmailSearchKeyConverter(
|
|
2045
|
-
email_column,
|
|
2046
|
-
hem_column,
|
|
2047
|
-
search_keys,
|
|
2048
|
-
columns_renaming,
|
|
2049
|
-
list(unnest_search_keys.keys()),
|
|
2050
|
-
self.logger,
|
|
1977
|
+
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2051
1978
|
)
|
|
2052
1979
|
df = converter.convert(df)
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
if ip_column:
|
|
2056
|
-
converter = IpSearchKeyConverter(
|
|
2057
|
-
ip_column,
|
|
2058
|
-
search_keys,
|
|
2059
|
-
columns_renaming,
|
|
2060
|
-
list(unnest_search_keys.keys()),
|
|
2061
|
-
self.bundle,
|
|
2062
|
-
self.logger,
|
|
2063
|
-
)
|
|
2064
|
-
df = converter.convert(df)
|
|
2065
|
-
|
|
2066
|
-
phone_column = self._get_phone_column(search_keys)
|
|
2067
|
-
country_column = self._get_country_column(search_keys)
|
|
2068
|
-
if phone_column:
|
|
2069
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2070
|
-
df = converter.convert(df)
|
|
2071
|
-
|
|
2072
|
-
if country_column:
|
|
2073
|
-
converter = CountrySearchKeyConverter(country_column)
|
|
2074
|
-
df = converter.convert(df)
|
|
2075
|
-
|
|
2076
|
-
postal_code = self._get_postal_column(search_keys)
|
|
2077
|
-
if postal_code:
|
|
2078
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2079
|
-
df = converter.convert(df)
|
|
2080
|
-
|
|
1980
|
+
generated_features.extend(converter.generated_features)
|
|
1981
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2081
1982
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2082
1983
|
|
|
2083
1984
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2084
|
-
for
|
|
1985
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1986
|
+
for col in original_features_for_transform:
|
|
2085
1987
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2086
|
-
features_not_to_pass = [
|
|
2087
|
-
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
1988
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1989
|
+
|
|
1990
|
+
if email_converted_to_hem:
|
|
1991
|
+
features_not_to_pass.append(email_column)
|
|
1992
|
+
|
|
1993
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2091
1995
|
|
|
2092
1996
|
if add_fit_system_record_id:
|
|
2093
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2094
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2095
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1997
|
+
df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
|
|
2096
1998
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2097
1999
|
features_not_to_pass.append(SORT_ID)
|
|
2098
2000
|
|
|
2099
|
-
|
|
2100
|
-
|
|
2001
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
2002
|
+
|
|
2101
2003
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2102
2004
|
"Float64"
|
|
2103
2005
|
)
|
|
@@ -2133,7 +2035,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2133
2035
|
rest_client=self.rest_client,
|
|
2134
2036
|
logger=self.logger,
|
|
2135
2037
|
)
|
|
2136
|
-
|
|
2038
|
+
if email_converted_to_hem:
|
|
2039
|
+
dataset.ignore_columns = [email_column]
|
|
2137
2040
|
|
|
2138
2041
|
if max_features is not None or importance_threshold is not None:
|
|
2139
2042
|
exclude_features_sources = list(
|
|
@@ -2222,9 +2125,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2222
2125
|
result = enrich()
|
|
2223
2126
|
|
|
2224
2127
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2225
|
-
existing_filtered_columns = [
|
|
2226
|
-
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2227
|
-
]
|
|
2128
|
+
existing_filtered_columns = [c for c in filtered_columns if c in result.columns]
|
|
2228
2129
|
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2229
2130
|
if add_fit_system_record_id:
|
|
2230
2131
|
selecting_columns.append(SORT_ID)
|
|
@@ -2237,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2237
2138
|
if add_fit_system_record_id:
|
|
2238
2139
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2239
2140
|
|
|
2240
|
-
return result
|
|
2141
|
+
return result
|
|
2241
2142
|
|
|
2242
2143
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2243
2144
|
features_info = self._internal_features_info
|
|
@@ -2338,9 +2239,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2338
2239
|
self.df_with_original_index = None
|
|
2339
2240
|
self.__cached_sampled_datasets = None
|
|
2340
2241
|
self.metrics = None
|
|
2341
|
-
self.fit_columns_renaming = None
|
|
2342
|
-
self.fit_dropped_features = set()
|
|
2343
|
-
self.fit_generated_features = []
|
|
2344
2242
|
|
|
2345
2243
|
validated_X = self._validate_X(X)
|
|
2346
2244
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2387,10 +2285,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2387
2285
|
self.fit_search_keys = self.search_keys.copy()
|
|
2388
2286
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2389
2287
|
|
|
2390
|
-
maybe_date_column =
|
|
2288
|
+
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2391
2289
|
has_date = maybe_date_column is not None
|
|
2392
2290
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2393
|
-
|
|
2394
2291
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2395
2292
|
|
|
2396
2293
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2420,13 +2317,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2420
2317
|
self.fit_generated_features = []
|
|
2421
2318
|
|
|
2422
2319
|
if has_date:
|
|
2423
|
-
converter = DateTimeSearchKeyConverter(
|
|
2424
|
-
maybe_date_column,
|
|
2425
|
-
self.date_format,
|
|
2426
|
-
self.logger,
|
|
2427
|
-
bundle=self.bundle,
|
|
2428
|
-
warnings_counter=self.warning_counter,
|
|
2429
|
-
)
|
|
2320
|
+
converter = DateTimeSearchKeyConverter(maybe_date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2430
2321
|
df = converter.convert(df, keep_time=True)
|
|
2431
2322
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2432
2323
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2435,12 +2326,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2435
2326
|
if self.add_date_if_missing:
|
|
2436
2327
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2437
2328
|
|
|
2438
|
-
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
2439
|
-
if email_columns:
|
|
2440
|
-
generator = EmailDomainGenerator(email_columns)
|
|
2441
|
-
df = generator.generate(df)
|
|
2442
|
-
self.fit_generated_features.extend(generator.generated_features)
|
|
2443
|
-
|
|
2444
2329
|
# Checks that need validated date
|
|
2445
2330
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2446
2331
|
|
|
@@ -2449,12 +2334,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2449
2334
|
|
|
2450
2335
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2451
2336
|
|
|
2452
|
-
|
|
2453
|
-
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2454
|
-
)
|
|
2455
|
-
df = normalizer.normalize(df)
|
|
2456
|
-
columns_renaming = normalizer.columns_renaming
|
|
2457
|
-
self.fit_columns_renaming = columns_renaming
|
|
2337
|
+
# TODO normalize and convert all columns
|
|
2458
2338
|
|
|
2459
2339
|
df = remove_fintech_duplicates(
|
|
2460
2340
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2462,58 +2342,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2462
2342
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2463
2343
|
|
|
2464
2344
|
# Explode multiple search keys
|
|
2465
|
-
|
|
2345
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2346
|
+
meaning_types = {
|
|
2347
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
+
}
|
|
2350
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2466
2354
|
|
|
2467
2355
|
# TODO check that this is correct for enrichment
|
|
2468
2356
|
self.df_with_original_index = df.copy()
|
|
2469
|
-
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2470
2357
|
|
|
2471
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys
|
|
2358
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2472
2359
|
|
|
2473
2360
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2474
2361
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2475
2362
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
+
email_converted_to_hem = False
|
|
2476
2364
|
if email_column:
|
|
2477
2365
|
converter = EmailSearchKeyConverter(
|
|
2478
|
-
email_column,
|
|
2479
|
-
hem_column,
|
|
2480
|
-
self.fit_search_keys,
|
|
2481
|
-
columns_renaming,
|
|
2482
|
-
list(unnest_search_keys.keys()),
|
|
2483
|
-
self.logger,
|
|
2366
|
+
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2484
2367
|
)
|
|
2485
2368
|
df = converter.convert(df)
|
|
2486
|
-
|
|
2487
|
-
|
|
2488
|
-
if ip_column:
|
|
2489
|
-
converter = IpSearchKeyConverter(
|
|
2490
|
-
ip_column,
|
|
2491
|
-
self.fit_search_keys,
|
|
2492
|
-
columns_renaming,
|
|
2493
|
-
list(unnest_search_keys.keys()),
|
|
2494
|
-
self.bundle,
|
|
2495
|
-
self.logger,
|
|
2496
|
-
)
|
|
2497
|
-
df = converter.convert(df)
|
|
2498
|
-
|
|
2499
|
-
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2500
|
-
country_column = self._get_country_column(self.fit_search_keys)
|
|
2501
|
-
if phone_column:
|
|
2502
|
-
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2503
|
-
df = converter.convert(df)
|
|
2504
|
-
|
|
2505
|
-
if country_column:
|
|
2506
|
-
converter = CountrySearchKeyConverter(country_column)
|
|
2507
|
-
df = converter.convert(df)
|
|
2508
|
-
|
|
2509
|
-
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2510
|
-
if postal_code:
|
|
2511
|
-
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2512
|
-
df = converter.convert(df)
|
|
2369
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2370
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2513
2371
|
|
|
2514
2372
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2515
2373
|
self.fit_search_keys.keys()
|
|
2516
2374
|
)
|
|
2375
|
+
if email_converted_to_hem:
|
|
2376
|
+
non_feature_columns.append(email_column)
|
|
2517
2377
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2518
2378
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2519
2379
|
|
|
@@ -2525,6 +2385,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2525
2385
|
self.fit_dropped_features.update(features_to_drop)
|
|
2526
2386
|
df = df.drop(columns=features_to_drop)
|
|
2527
2387
|
|
|
2388
|
+
if email_converted_to_hem:
|
|
2389
|
+
self.fit_dropped_features.add(email_column)
|
|
2390
|
+
|
|
2528
2391
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2529
2392
|
|
|
2530
2393
|
meaning_types = {
|
|
@@ -2538,12 +2401,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2538
2401
|
if eval_set is not None and len(eval_set) > 0:
|
|
2539
2402
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2540
2403
|
|
|
2541
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2542
|
-
|
|
2543
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2544
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2545
|
-
|
|
2546
|
-
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2404
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2547
2405
|
|
|
2548
2406
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2549
2407
|
|
|
@@ -2561,7 +2419,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2561
2419
|
rest_client=self.rest_client,
|
|
2562
2420
|
logger=self.logger,
|
|
2563
2421
|
)
|
|
2564
|
-
|
|
2422
|
+
if email_converted_to_hem:
|
|
2423
|
+
dataset.ignore_columns = [email_column]
|
|
2565
2424
|
|
|
2566
2425
|
self.passed_features = [
|
|
2567
2426
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2950,7 +2809,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2950
2809
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2951
2810
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2952
2811
|
else:
|
|
2953
|
-
date_column =
|
|
2812
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
2954
2813
|
sort_columns = [date_column] if date_column is not None else []
|
|
2955
2814
|
|
|
2956
2815
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -3046,10 +2905,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3046
2905
|
|
|
3047
2906
|
do_without_pandas_limits(print_datasets_sample)
|
|
3048
2907
|
|
|
3049
|
-
maybe_date_col =
|
|
2908
|
+
maybe_date_col = self._get_date_column(self.search_keys)
|
|
3050
2909
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
3051
2910
|
# TODO cast date column to single dtype
|
|
3052
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format
|
|
2911
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3053
2912
|
converted_X = date_converter.convert(X)
|
|
3054
2913
|
min_date = converted_X[maybe_date_col].min()
|
|
3055
2914
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -3076,6 +2935,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3076
2935
|
|
|
3077
2936
|
return df
|
|
3078
2937
|
|
|
2938
|
+
@staticmethod
|
|
2939
|
+
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
+
for col, t in search_keys.items():
|
|
2941
|
+
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
+
return col
|
|
2943
|
+
|
|
3079
2944
|
@staticmethod
|
|
3080
2945
|
def _add_current_date_as_key(
|
|
3081
2946
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -3091,7 +2956,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3091
2956
|
logger.warning(msg)
|
|
3092
2957
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3093
2958
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3094
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
2959
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
3095
2960
|
df = converter.convert(df)
|
|
3096
2961
|
return df
|
|
3097
2962
|
|
|
@@ -3119,37 +2984,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3119
2984
|
if len(cols) == 1:
|
|
3120
2985
|
return cols[0]
|
|
3121
2986
|
|
|
3122
|
-
@staticmethod
|
|
3123
|
-
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3124
|
-
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3125
|
-
if len(cols) > 1:
|
|
3126
|
-
raise Exception("More than one ip column found after unnest")
|
|
3127
|
-
if len(cols) == 1:
|
|
3128
|
-
return cols[0]
|
|
3129
|
-
|
|
3130
2987
|
@staticmethod
|
|
3131
2988
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3132
2989
|
for col, t in search_keys.items():
|
|
3133
2990
|
if t == SearchKey.PHONE:
|
|
3134
2991
|
return col
|
|
3135
2992
|
|
|
3136
|
-
@staticmethod
|
|
3137
|
-
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3138
|
-
for col, t in search_keys.items():
|
|
3139
|
-
if t == SearchKey.COUNTRY:
|
|
3140
|
-
return col
|
|
3141
|
-
|
|
3142
|
-
@staticmethod
|
|
3143
|
-
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3144
|
-
for col, t in search_keys.items():
|
|
3145
|
-
if t == SearchKey.POSTAL_CODE:
|
|
3146
|
-
return col
|
|
3147
|
-
|
|
3148
2993
|
def _explode_multiple_search_keys(
|
|
3149
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
2994
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3150
2995
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
3151
2996
|
# find groups of multiple search keys
|
|
3152
|
-
search_key_names_by_type: Dict[SearchKey,
|
|
2997
|
+
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3153
2998
|
for key_name, key_type in search_keys.items():
|
|
3154
2999
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3155
3000
|
search_key_names_by_type = {
|
|
@@ -3173,7 +3018,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3173
3018
|
del search_keys[old_key]
|
|
3174
3019
|
search_keys[new_search_key] = key_type
|
|
3175
3020
|
unnest_search_keys[new_search_key] = key_names
|
|
3176
|
-
columns_renaming[new_search_key] = new_search_key
|
|
3177
3021
|
|
|
3178
3022
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3179
3023
|
return df, unnest_search_keys
|
|
@@ -3181,7 +3025,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3181
3025
|
def __add_fit_system_record_id(
|
|
3182
3026
|
self,
|
|
3183
3027
|
df: pd.DataFrame,
|
|
3184
|
-
|
|
3028
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3185
3029
|
search_keys: Dict[str, SearchKey],
|
|
3186
3030
|
id_name: str,
|
|
3187
3031
|
) -> pd.DataFrame:
|
|
@@ -3204,9 +3048,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3204
3048
|
]
|
|
3205
3049
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3206
3050
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3207
|
-
sort_exclude_columns.append(
|
|
3051
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3208
3052
|
else:
|
|
3209
|
-
date_column =
|
|
3053
|
+
date_column = self._get_date_column(search_keys)
|
|
3210
3054
|
sort_columns = [date_column] if date_column is not None else []
|
|
3211
3055
|
|
|
3212
3056
|
other_columns = sorted(
|
|
@@ -3215,6 +3059,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3215
3059
|
for c in df.columns
|
|
3216
3060
|
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
3217
3061
|
]
|
|
3062
|
+
# [
|
|
3063
|
+
# sk
|
|
3064
|
+
# for sk, key_type in search_keys.items()
|
|
3065
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
+
# and sk in df.columns
|
|
3067
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
+
# ]
|
|
3218
3069
|
)
|
|
3219
3070
|
|
|
3220
3071
|
search_keys_hash = "search_keys_hash"
|
|
@@ -3227,6 +3078,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3227
3078
|
if search_keys_hash in df.columns:
|
|
3228
3079
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3229
3080
|
|
|
3081
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
+
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
+
|
|
3230
3084
|
df = df.reset_index(drop=True).reset_index()
|
|
3231
3085
|
# system_record_id saves correct order for fit
|
|
3232
3086
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3236,11 +3090,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3236
3090
|
df.index.name = original_index_name
|
|
3237
3091
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3238
3092
|
|
|
3239
|
-
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
|
|
3243
|
-
|
|
3093
|
+
meaning_types[id_name] = (
|
|
3094
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3095
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3096
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3097
|
+
)
|
|
3244
3098
|
return df
|
|
3245
3099
|
|
|
3246
3100
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3618,13 +3472,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3618
3472
|
for _, key_type in search_keys.items():
|
|
3619
3473
|
if not isinstance(key_type, SearchKey):
|
|
3620
3474
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3621
|
-
|
|
3622
3475
|
valid_search_keys = {}
|
|
3623
3476
|
unsupported_search_keys = {
|
|
3624
3477
|
SearchKey.IP_RANGE_FROM,
|
|
3625
3478
|
SearchKey.IP_RANGE_TO,
|
|
3626
|
-
SearchKey.IPV6_RANGE_FROM,
|
|
3627
|
-
SearchKey.IPV6_RANGE_TO,
|
|
3628
3479
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3629
3480
|
SearchKey.MSISDN_RANGE_TO,
|
|
3630
3481
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3714,7 +3565,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3714
3565
|
print(msg)
|
|
3715
3566
|
self.logger.warning(msg)
|
|
3716
3567
|
self.warning_counter.increment()
|
|
3717
|
-
# TODO maybe raise ValidationError
|
|
3718
3568
|
|
|
3719
3569
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3720
3570
|
|