upgini 1.1.315a3579.dev1__py3-none-any.whl → 1.1.316__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -1
- upgini/data_source/data_source_publisher.py +9 -0
- upgini/dataset.py +34 -387
- upgini/features_enricher.py +335 -167
- upgini/metadata.py +16 -1
- upgini/normalizer/normalize_utils.py +203 -0
- upgini/utils/country_utils.py +16 -0
- upgini/utils/datetime_utils.py +37 -16
- upgini/utils/email_utils.py +49 -17
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +345 -0
- upgini/utils/postal_code_utils.py +34 -0
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316.dist-info}/METADATA +1 -1
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316.dist-info}/RECORD +17 -17
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316.dist-info}/WHEEL +1 -1
- upgini/normalizer/phone_normalizer.py +0 -340
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -61,11 +61,15 @@ from upgini.metadata import (
|
|
|
61
61
|
SearchKey,
|
|
62
62
|
)
|
|
63
63
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
64
|
+
from upgini.normalizer.normalize_utils import Normalizer
|
|
64
65
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
65
66
|
from upgini.search_task import SearchTask
|
|
66
67
|
from upgini.spinner import Spinner
|
|
67
68
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
68
|
-
from upgini.utils.country_utils import
|
|
69
|
+
from upgini.utils.country_utils import (
|
|
70
|
+
CountrySearchKeyConverter,
|
|
71
|
+
CountrySearchKeyDetector,
|
|
72
|
+
)
|
|
69
73
|
from upgini.utils.custom_loss_utils import (
|
|
70
74
|
get_additional_params_custom_loss,
|
|
71
75
|
get_runtime_params_custom_loss,
|
|
@@ -87,11 +91,19 @@ from upgini.utils.display_utils import (
|
|
|
87
91
|
prepare_and_show_report,
|
|
88
92
|
show_request_quote_button,
|
|
89
93
|
)
|
|
90
|
-
from upgini.utils.email_utils import
|
|
94
|
+
from upgini.utils.email_utils import (
|
|
95
|
+
EmailDomainGenerator,
|
|
96
|
+
EmailSearchKeyConverter,
|
|
97
|
+
EmailSearchKeyDetector,
|
|
98
|
+
)
|
|
91
99
|
from upgini.utils.features_validator import FeaturesValidator
|
|
92
100
|
from upgini.utils.format import Format
|
|
93
|
-
from upgini.utils.
|
|
94
|
-
from upgini.utils.
|
|
101
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
102
|
+
from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
|
|
103
|
+
from upgini.utils.postal_code_utils import (
|
|
104
|
+
PostalCodeSearchKeyConverter,
|
|
105
|
+
PostalCodeSearchKeyDetector,
|
|
106
|
+
)
|
|
95
107
|
|
|
96
108
|
try:
|
|
97
109
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -237,6 +249,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
249
|
|
|
238
250
|
self.passed_features: List[str] = []
|
|
239
251
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
252
|
+
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
240
253
|
self.country_added = False
|
|
241
254
|
self.fit_generated_features: List[str] = []
|
|
242
255
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -247,7 +260,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
247
260
|
self.eval_set: Optional[List[Tuple]] = None
|
|
248
261
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
249
262
|
self.imbalanced = False
|
|
250
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
263
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
|
|
251
264
|
|
|
252
265
|
validate_version(self.logger)
|
|
253
266
|
self.search_keys = search_keys or {}
|
|
@@ -706,7 +719,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
706
719
|
|
|
707
720
|
start_time = time.time()
|
|
708
721
|
try:
|
|
709
|
-
result = self.__inner_transform(
|
|
722
|
+
result, _ = self.__inner_transform(
|
|
710
723
|
trace_id,
|
|
711
724
|
X,
|
|
712
725
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -833,17 +846,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
833
846
|
self.logger.warning(msg)
|
|
834
847
|
print(msg)
|
|
835
848
|
|
|
849
|
+
if X is not None and y is None:
|
|
850
|
+
raise ValidationError("X passed without y")
|
|
851
|
+
|
|
836
852
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
837
853
|
effective_X = X if X is not None else self.X
|
|
838
854
|
effective_y = y if y is not None else self.y
|
|
839
855
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
840
856
|
effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
|
|
841
857
|
|
|
858
|
+
if (
|
|
859
|
+
self._search_task is None
|
|
860
|
+
or self._search_task.provider_metadata_v2 is None
|
|
861
|
+
or len(self._search_task.provider_metadata_v2) == 0
|
|
862
|
+
or effective_X is None
|
|
863
|
+
or effective_y is None
|
|
864
|
+
):
|
|
865
|
+
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
866
|
+
|
|
867
|
+
validated_X = self._validate_X(effective_X)
|
|
868
|
+
validated_y = self._validate_y(validated_X, effective_y)
|
|
869
|
+
validated_eval_set = (
|
|
870
|
+
[self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
|
|
871
|
+
if effective_eval_set is not None
|
|
872
|
+
else None
|
|
873
|
+
)
|
|
874
|
+
|
|
842
875
|
try:
|
|
843
876
|
self.__log_debug_information(
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
877
|
+
validated_X,
|
|
878
|
+
validated_y,
|
|
879
|
+
validated_eval_set,
|
|
847
880
|
exclude_features_sources=exclude_features_sources,
|
|
848
881
|
cv=cv if cv is not None else self.cv,
|
|
849
882
|
importance_threshold=importance_threshold,
|
|
@@ -853,21 +886,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
853
886
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
|
854
887
|
)
|
|
855
888
|
|
|
856
|
-
if (
|
|
857
|
-
self._search_task is None
|
|
858
|
-
or self._search_task.provider_metadata_v2 is None
|
|
859
|
-
or len(self._search_task.provider_metadata_v2) == 0
|
|
860
|
-
or effective_X is None
|
|
861
|
-
or effective_y is None
|
|
862
|
-
):
|
|
863
|
-
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
864
|
-
|
|
865
|
-
if X is not None and y is None:
|
|
866
|
-
raise ValidationError("X passed without y")
|
|
867
|
-
|
|
868
889
|
validate_scoring_argument(scoring)
|
|
869
890
|
|
|
870
|
-
self._validate_baseline_score(
|
|
891
|
+
self._validate_baseline_score(validated_X, validated_eval_set)
|
|
871
892
|
|
|
872
893
|
if self._has_paid_features(exclude_features_sources):
|
|
873
894
|
msg = self.bundle.get("metrics_with_paid_features")
|
|
@@ -876,7 +897,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
876
897
|
return None
|
|
877
898
|
|
|
878
899
|
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
879
|
-
estimator,
|
|
900
|
+
estimator, validated_X, self.search_keys
|
|
880
901
|
)
|
|
881
902
|
|
|
882
903
|
prepared_data = self._prepare_data_for_metrics(
|
|
@@ -906,8 +927,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
906
927
|
search_keys,
|
|
907
928
|
groups,
|
|
908
929
|
_cv,
|
|
930
|
+
columns_renaming,
|
|
909
931
|
) = prepared_data
|
|
910
932
|
|
|
933
|
+
# rename cat_features
|
|
934
|
+
if cat_features:
|
|
935
|
+
for new_c, old_c in columns_renaming.items():
|
|
936
|
+
if old_c in cat_features:
|
|
937
|
+
cat_features.remove(old_c)
|
|
938
|
+
cat_features.append(new_c)
|
|
939
|
+
|
|
911
940
|
gc.collect()
|
|
912
941
|
|
|
913
942
|
print(self.bundle.get("metrics_start"))
|
|
@@ -920,7 +949,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
920
949
|
|
|
921
950
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
922
951
|
|
|
923
|
-
has_date =
|
|
952
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
924
953
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
925
954
|
|
|
926
955
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1013,10 +1042,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1013
1042
|
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1014
1043
|
}
|
|
1015
1044
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1016
|
-
|
|
1045
|
+
validated_y
|
|
1017
1046
|
):
|
|
1018
1047
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1019
|
-
np.mean(
|
|
1048
|
+
np.mean(validated_y), 4
|
|
1020
1049
|
)
|
|
1021
1050
|
if etalon_metric is not None:
|
|
1022
1051
|
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
@@ -1086,10 +1115,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1086
1115
|
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1087
1116
|
}
|
|
1088
1117
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1089
|
-
|
|
1118
|
+
validated_eval_set[idx][1]
|
|
1090
1119
|
):
|
|
1091
1120
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1092
|
-
np.mean(
|
|
1121
|
+
np.mean(validated_eval_set[idx][1]), 4
|
|
1093
1122
|
)
|
|
1094
1123
|
if etalon_eval_metric is not None:
|
|
1095
1124
|
eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
|
|
@@ -1113,7 +1142,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1113
1142
|
)
|
|
1114
1143
|
|
|
1115
1144
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1116
|
-
date_column =
|
|
1145
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1117
1146
|
if (
|
|
1118
1147
|
uplift_col in metrics_df.columns
|
|
1119
1148
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1195,27 +1224,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1195
1224
|
def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
|
|
1196
1225
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
|
1197
1226
|
|
|
1198
|
-
def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
|
|
1199
|
-
search_keys = self.search_keys.copy()
|
|
1200
|
-
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1201
|
-
|
|
1202
|
-
extended_X = x.copy()
|
|
1203
|
-
generated_features = []
|
|
1204
|
-
date_column = self._get_date_column(search_keys)
|
|
1205
|
-
if date_column is not None:
|
|
1206
|
-
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1207
|
-
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1208
|
-
generated_features.extend(converter.generated_features)
|
|
1209
|
-
email_column = self._get_email_column(search_keys)
|
|
1210
|
-
hem_column = self._get_hem_column(search_keys)
|
|
1211
|
-
if email_column:
|
|
1212
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1213
|
-
extended_X = converter.convert(extended_X)
|
|
1214
|
-
generated_features.extend(converter.generated_features)
|
|
1215
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1216
|
-
|
|
1217
|
-
return extended_X, search_keys
|
|
1218
|
-
|
|
1219
1227
|
def _is_input_same_as_fit(
|
|
1220
1228
|
self,
|
|
1221
1229
|
X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
|
@@ -1259,7 +1267,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1259
1267
|
groups = None
|
|
1260
1268
|
|
|
1261
1269
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1262
|
-
date_column =
|
|
1270
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1263
1271
|
date_series = X[date_column] if date_column is not None else None
|
|
1264
1272
|
_cv, groups = CVConfig(
|
|
1265
1273
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1282,7 +1290,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1282
1290
|
|
|
1283
1291
|
def _get_client_cat_features(
|
|
1284
1292
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1285
|
-
) -> Optional[List[str]]:
|
|
1293
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
|
1286
1294
|
cat_features = None
|
|
1287
1295
|
search_keys_for_metrics = []
|
|
1288
1296
|
if (
|
|
@@ -1342,11 +1350,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1342
1350
|
progress_bar,
|
|
1343
1351
|
progress_callback,
|
|
1344
1352
|
)
|
|
1345
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(
|
|
1353
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
|
|
1354
|
+
sampled_data
|
|
1355
|
+
)
|
|
1346
1356
|
|
|
1347
1357
|
excluding_search_keys = list(search_keys.keys())
|
|
1348
1358
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1349
|
-
|
|
1359
|
+
for sk in excluding_search_keys:
|
|
1360
|
+
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1361
|
+
excluding_search_keys.remove(sk)
|
|
1350
1362
|
|
|
1351
1363
|
client_features = [
|
|
1352
1364
|
c
|
|
@@ -1363,6 +1375,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1363
1375
|
importance_threshold,
|
|
1364
1376
|
max_features,
|
|
1365
1377
|
)
|
|
1378
|
+
filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
|
|
1366
1379
|
|
|
1367
1380
|
X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
|
|
1368
1381
|
enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
|
|
@@ -1392,6 +1405,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1392
1405
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1393
1406
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1394
1407
|
|
|
1408
|
+
# TODO maybe there is no more need for these conversions
|
|
1395
1409
|
# Remove datetime features
|
|
1396
1410
|
datetime_features = [
|
|
1397
1411
|
f for f in fitting_X.columns if is_datetime64_any_dtype(fitting_X[f]) or is_period_dtype(fitting_X[f])
|
|
@@ -1479,6 +1493,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1479
1493
|
search_keys,
|
|
1480
1494
|
groups,
|
|
1481
1495
|
cv,
|
|
1496
|
+
columns_renaming,
|
|
1482
1497
|
)
|
|
1483
1498
|
|
|
1484
1499
|
@dataclass
|
|
@@ -1488,6 +1503,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1488
1503
|
enriched_X: pd.DataFrame
|
|
1489
1504
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1490
1505
|
search_keys: Dict[str, SearchKey]
|
|
1506
|
+
columns_renaming: Dict[str, str]
|
|
1491
1507
|
|
|
1492
1508
|
def _sample_data_for_metrics(
|
|
1493
1509
|
self,
|
|
@@ -1527,11 +1543,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1527
1543
|
)
|
|
1528
1544
|
|
|
1529
1545
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1530
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys =
|
|
1546
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
|
1547
|
+
self.__cached_sampled_datasets
|
|
1548
|
+
)
|
|
1531
1549
|
if exclude_features_sources:
|
|
1532
1550
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1533
1551
|
|
|
1534
|
-
return self.__mk_sampled_data_tuple(
|
|
1552
|
+
return self.__mk_sampled_data_tuple(
|
|
1553
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1554
|
+
)
|
|
1535
1555
|
|
|
1536
1556
|
def __sample_only_input(
|
|
1537
1557
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1549,6 +1569,28 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1549
1569
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1550
1570
|
df = pd.concat([df, eval_xy])
|
|
1551
1571
|
|
|
1572
|
+
search_keys = self.search_keys.copy()
|
|
1573
|
+
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1574
|
+
|
|
1575
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1576
|
+
generated_features = []
|
|
1577
|
+
if date_column is not None:
|
|
1578
|
+
converter = DateTimeSearchKeyConverter(
|
|
1579
|
+
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1580
|
+
)
|
|
1581
|
+
df = converter.convert(df, keep_time=True)
|
|
1582
|
+
generated_features = converter.generated_features
|
|
1583
|
+
|
|
1584
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
1585
|
+
if email_columns:
|
|
1586
|
+
generator = EmailDomainGenerator(email_columns)
|
|
1587
|
+
df = generator.generate(df)
|
|
1588
|
+
generated_features.extend(generator.generated_features)
|
|
1589
|
+
|
|
1590
|
+
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1591
|
+
df = normalizer.normalize(df)
|
|
1592
|
+
columns_renaming = normalizer.columns_renaming
|
|
1593
|
+
|
|
1552
1594
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1553
1595
|
|
|
1554
1596
|
num_samples = _num_samples(df)
|
|
@@ -1561,24 +1603,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1561
1603
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1562
1604
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1563
1605
|
|
|
1564
|
-
|
|
1565
|
-
|
|
1606
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1607
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1608
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1566
1609
|
|
|
1567
|
-
train_df =
|
|
1610
|
+
train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
|
|
1568
1611
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1569
1612
|
y_sampled = train_df[TARGET].copy()
|
|
1570
1613
|
enriched_X = X_sampled
|
|
1571
1614
|
|
|
1572
1615
|
if eval_set is not None:
|
|
1573
1616
|
for idx in range(len(eval_set)):
|
|
1574
|
-
eval_xy_sampled =
|
|
1617
|
+
eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1575
1618
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1576
1619
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1577
1620
|
enriched_eval_X = eval_X_sampled
|
|
1578
1621
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1579
|
-
self.__cached_sampled_datasets = (
|
|
1622
|
+
self.__cached_sampled_datasets = (
|
|
1623
|
+
X_sampled,
|
|
1624
|
+
y_sampled,
|
|
1625
|
+
enriched_X,
|
|
1626
|
+
eval_set_sampled_dict,
|
|
1627
|
+
search_keys,
|
|
1628
|
+
columns_renaming,
|
|
1629
|
+
)
|
|
1580
1630
|
|
|
1581
|
-
return self.__mk_sampled_data_tuple(
|
|
1631
|
+
return self.__mk_sampled_data_tuple(
|
|
1632
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1633
|
+
)
|
|
1582
1634
|
|
|
1583
1635
|
def __sample_balanced(
|
|
1584
1636
|
self,
|
|
@@ -1590,7 +1642,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1590
1642
|
search_keys = self.fit_search_keys
|
|
1591
1643
|
|
|
1592
1644
|
rows_to_drop = None
|
|
1593
|
-
has_date =
|
|
1645
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1594
1646
|
task_type = self.model_task_type or define_task(
|
|
1595
1647
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1596
1648
|
)
|
|
@@ -1644,9 +1696,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1644
1696
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1645
1697
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1646
1698
|
|
|
1647
|
-
self.__cached_sampled_datasets = (
|
|
1699
|
+
self.__cached_sampled_datasets = (
|
|
1700
|
+
X_sampled,
|
|
1701
|
+
y_sampled,
|
|
1702
|
+
enriched_X,
|
|
1703
|
+
eval_set_sampled_dict,
|
|
1704
|
+
search_keys,
|
|
1705
|
+
self.fit_columns_renaming,
|
|
1706
|
+
)
|
|
1648
1707
|
|
|
1649
|
-
return self.__mk_sampled_data_tuple(
|
|
1708
|
+
return self.__mk_sampled_data_tuple(
|
|
1709
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1710
|
+
)
|
|
1650
1711
|
|
|
1651
1712
|
def __sample_imbalanced(
|
|
1652
1713
|
self,
|
|
@@ -1686,7 +1747,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1686
1747
|
tmp_target_name = "__target"
|
|
1687
1748
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1688
1749
|
|
|
1689
|
-
enriched_df = self.__inner_transform(
|
|
1750
|
+
enriched_df, columns_renaming = self.__inner_transform(
|
|
1690
1751
|
trace_id,
|
|
1691
1752
|
df,
|
|
1692
1753
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1734,7 +1795,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1734
1795
|
tmp_target_name = "__target"
|
|
1735
1796
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1736
1797
|
|
|
1737
|
-
enriched_Xy = self.__inner_transform(
|
|
1798
|
+
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1738
1799
|
trace_id,
|
|
1739
1800
|
df,
|
|
1740
1801
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1759,9 +1820,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1759
1820
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1760
1821
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1761
1822
|
|
|
1762
|
-
self.__cached_sampled_datasets = (
|
|
1823
|
+
self.__cached_sampled_datasets = (
|
|
1824
|
+
X_sampled,
|
|
1825
|
+
y_sampled,
|
|
1826
|
+
enriched_X,
|
|
1827
|
+
eval_set_sampled_dict,
|
|
1828
|
+
self.search_keys,
|
|
1829
|
+
columns_renaming,
|
|
1830
|
+
)
|
|
1763
1831
|
|
|
1764
|
-
return self.__mk_sampled_data_tuple(
|
|
1832
|
+
return self.__mk_sampled_data_tuple(
|
|
1833
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1834
|
+
)
|
|
1765
1835
|
|
|
1766
1836
|
def __mk_sampled_data_tuple(
|
|
1767
1837
|
self,
|
|
@@ -1770,6 +1840,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1770
1840
|
enriched_X: pd.DataFrame,
|
|
1771
1841
|
eval_set_sampled_dict: Dict,
|
|
1772
1842
|
search_keys: Dict,
|
|
1843
|
+
columns_renaming: Dict[str, str],
|
|
1773
1844
|
):
|
|
1774
1845
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1775
1846
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1778,6 +1849,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1778
1849
|
enriched_X=enriched_X,
|
|
1779
1850
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1780
1851
|
search_keys=search_keys,
|
|
1852
|
+
columns_renaming=columns_renaming,
|
|
1781
1853
|
)
|
|
1782
1854
|
|
|
1783
1855
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1866,7 +1938,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1866
1938
|
progress_bar: Optional[ProgressBar] = None,
|
|
1867
1939
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1868
1940
|
add_fit_system_record_id: bool = False,
|
|
1869
|
-
) -> pd.DataFrame:
|
|
1941
|
+
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
1870
1942
|
if self._search_task is None:
|
|
1871
1943
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1872
1944
|
|
|
@@ -1879,13 +1951,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1951
|
|
|
1880
1952
|
if len(self.feature_names_) == 0:
|
|
1881
1953
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1882
|
-
return X
|
|
1954
|
+
return X, {c: c for c in X.columns}
|
|
1883
1955
|
|
|
1884
1956
|
if self._has_paid_features(exclude_features_sources):
|
|
1885
1957
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1886
1958
|
self.logger.warning(msg)
|
|
1887
1959
|
self.__display_support_link(msg)
|
|
1888
|
-
return None
|
|
1960
|
+
return None, {c: c for c in X.columns}
|
|
1889
1961
|
|
|
1890
1962
|
if not metrics_calculation:
|
|
1891
1963
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1896,7 +1968,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1968
|
self.logger.warning(msg)
|
|
1897
1969
|
print(msg)
|
|
1898
1970
|
show_request_quote_button()
|
|
1899
|
-
return None
|
|
1971
|
+
return None, {c: c for c in X.columns}
|
|
1900
1972
|
else:
|
|
1901
1973
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1902
1974
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -1934,9 +2006,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1934
2006
|
df = self.__add_country_code(df, search_keys)
|
|
1935
2007
|
|
|
1936
2008
|
generated_features = []
|
|
1937
|
-
date_column =
|
|
2009
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1938
2010
|
if date_column is not None:
|
|
1939
|
-
converter = DateTimeSearchKeyConverter(
|
|
2011
|
+
converter = DateTimeSearchKeyConverter(
|
|
2012
|
+
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2013
|
+
)
|
|
1940
2014
|
df = converter.convert(df)
|
|
1941
2015
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
1942
2016
|
generated_features.extend(converter.generated_features)
|
|
@@ -1945,61 +2019,93 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1945
2019
|
if self.add_date_if_missing:
|
|
1946
2020
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1947
2021
|
|
|
2022
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2023
|
+
if email_columns:
|
|
2024
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2025
|
+
df = generator.generate(df)
|
|
2026
|
+
generated_features.extend(generator.generated_features)
|
|
2027
|
+
|
|
2028
|
+
normalizer = Normalizer(
|
|
2029
|
+
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2030
|
+
)
|
|
2031
|
+
df = normalizer.normalize(df)
|
|
2032
|
+
columns_renaming = normalizer.columns_renaming
|
|
2033
|
+
|
|
1948
2034
|
# Don't pass all features in backend on transform
|
|
1949
|
-
original_features_for_transform = []
|
|
1950
2035
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1951
|
-
|
|
1952
|
-
if len(
|
|
1953
|
-
|
|
1954
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
-
original_features_for_transform = [
|
|
1958
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
-
]
|
|
2036
|
+
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2037
|
+
if len(features_for_transform) > 0:
|
|
2038
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1960
2039
|
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2040
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
1964
2041
|
|
|
1965
2042
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1966
2043
|
df[columns_for_system_record_id], index=False
|
|
1967
2044
|
).astype("Float64")
|
|
1968
2045
|
|
|
1969
2046
|
# Explode multiple search keys
|
|
1970
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2047
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
|
1971
2048
|
|
|
1972
2049
|
email_column = self._get_email_column(search_keys)
|
|
1973
2050
|
hem_column = self._get_hem_column(search_keys)
|
|
1974
|
-
email_converted_to_hem = False
|
|
1975
2051
|
if email_column:
|
|
1976
2052
|
converter = EmailSearchKeyConverter(
|
|
1977
|
-
email_column,
|
|
2053
|
+
email_column,
|
|
2054
|
+
hem_column,
|
|
2055
|
+
search_keys,
|
|
2056
|
+
columns_renaming,
|
|
2057
|
+
list(unnest_search_keys.keys()),
|
|
2058
|
+
self.logger,
|
|
1978
2059
|
)
|
|
1979
2060
|
df = converter.convert(df)
|
|
1980
|
-
|
|
1981
|
-
|
|
2061
|
+
|
|
2062
|
+
ip_column = self._get_ip_column(search_keys)
|
|
2063
|
+
if ip_column:
|
|
2064
|
+
converter = IpSearchKeyConverter(
|
|
2065
|
+
ip_column,
|
|
2066
|
+
search_keys,
|
|
2067
|
+
columns_renaming,
|
|
2068
|
+
list(unnest_search_keys.keys()),
|
|
2069
|
+
self.bundle,
|
|
2070
|
+
self.logger,
|
|
2071
|
+
)
|
|
2072
|
+
df = converter.convert(df)
|
|
2073
|
+
|
|
2074
|
+
phone_column = self._get_phone_column(search_keys)
|
|
2075
|
+
country_column = self._get_country_column(search_keys)
|
|
2076
|
+
if phone_column:
|
|
2077
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2078
|
+
df = converter.convert(df)
|
|
2079
|
+
|
|
2080
|
+
if country_column:
|
|
2081
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2082
|
+
df = converter.convert(df)
|
|
2083
|
+
|
|
2084
|
+
postal_code = self._get_postal_column(search_keys)
|
|
2085
|
+
if postal_code:
|
|
2086
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2087
|
+
df = converter.convert(df)
|
|
2088
|
+
|
|
1982
2089
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1983
2090
|
|
|
1984
2091
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1985
|
-
|
|
1986
|
-
for col in original_features_for_transform:
|
|
2092
|
+
for col in features_for_transform:
|
|
1987
2093
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1988
|
-
features_not_to_pass = [
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2094
|
+
features_not_to_pass = [
|
|
2095
|
+
c
|
|
2096
|
+
for c in df.columns
|
|
2097
|
+
if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
|
|
2098
|
+
]
|
|
1995
2099
|
|
|
1996
2100
|
if add_fit_system_record_id:
|
|
1997
|
-
df = self.__add_fit_system_record_id(df,
|
|
2101
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2102
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2103
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1998
2104
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1999
2105
|
features_not_to_pass.append(SORT_ID)
|
|
2000
2106
|
|
|
2001
|
-
|
|
2002
|
-
|
|
2107
|
+
# search keys might be changed after explode
|
|
2108
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2003
2109
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2004
2110
|
"Float64"
|
|
2005
2111
|
)
|
|
@@ -2035,8 +2141,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2035
2141
|
rest_client=self.rest_client,
|
|
2036
2142
|
logger=self.logger,
|
|
2037
2143
|
)
|
|
2038
|
-
|
|
2039
|
-
dataset.ignore_columns = [email_column]
|
|
2144
|
+
dataset.columns_renaming = columns_renaming
|
|
2040
2145
|
|
|
2041
2146
|
if max_features is not None or importance_threshold is not None:
|
|
2042
2147
|
exclude_features_sources = list(
|
|
@@ -2125,7 +2230,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2125
2230
|
result = enrich()
|
|
2126
2231
|
|
|
2127
2232
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2128
|
-
existing_filtered_columns = [
|
|
2233
|
+
existing_filtered_columns = [
|
|
2234
|
+
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2235
|
+
]
|
|
2129
2236
|
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2130
2237
|
if add_fit_system_record_id:
|
|
2131
2238
|
selecting_columns.append(SORT_ID)
|
|
@@ -2138,7 +2245,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2138
2245
|
if add_fit_system_record_id:
|
|
2139
2246
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2140
2247
|
|
|
2141
|
-
return result
|
|
2248
|
+
return result, columns_renaming
|
|
2142
2249
|
|
|
2143
2250
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2144
2251
|
features_info = self._internal_features_info
|
|
@@ -2239,6 +2346,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2239
2346
|
self.df_with_original_index = None
|
|
2240
2347
|
self.__cached_sampled_datasets = None
|
|
2241
2348
|
self.metrics = None
|
|
2349
|
+
self.fit_columns_renaming = None
|
|
2350
|
+
self.fit_dropped_features = set()
|
|
2351
|
+
self.fit_generated_features = []
|
|
2242
2352
|
|
|
2243
2353
|
validated_X = self._validate_X(X)
|
|
2244
2354
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2285,9 +2395,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2285
2395
|
self.fit_search_keys = self.search_keys.copy()
|
|
2286
2396
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2287
2397
|
|
|
2288
|
-
maybe_date_column =
|
|
2398
|
+
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2289
2399
|
has_date = maybe_date_column is not None
|
|
2290
2400
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2401
|
+
|
|
2291
2402
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2292
2403
|
|
|
2293
2404
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2317,7 +2428,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2317
2428
|
self.fit_generated_features = []
|
|
2318
2429
|
|
|
2319
2430
|
if has_date:
|
|
2320
|
-
converter = DateTimeSearchKeyConverter(
|
|
2431
|
+
converter = DateTimeSearchKeyConverter(
|
|
2432
|
+
maybe_date_column,
|
|
2433
|
+
self.date_format,
|
|
2434
|
+
self.logger,
|
|
2435
|
+
bundle=self.bundle,
|
|
2436
|
+
warnings_counter=self.warning_counter,
|
|
2437
|
+
)
|
|
2321
2438
|
df = converter.convert(df, keep_time=True)
|
|
2322
2439
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2323
2440
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2326,6 +2443,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2326
2443
|
if self.add_date_if_missing:
|
|
2327
2444
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2328
2445
|
|
|
2446
|
+
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
2447
|
+
if email_columns:
|
|
2448
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2449
|
+
df = generator.generate(df)
|
|
2450
|
+
self.fit_generated_features.extend(generator.generated_features)
|
|
2451
|
+
|
|
2329
2452
|
# Checks that need validated date
|
|
2330
2453
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2331
2454
|
|
|
@@ -2334,7 +2457,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2334
2457
|
|
|
2335
2458
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2336
2459
|
|
|
2337
|
-
|
|
2460
|
+
normalizer = Normalizer(
|
|
2461
|
+
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2462
|
+
)
|
|
2463
|
+
df = normalizer.normalize(df)
|
|
2464
|
+
columns_renaming = normalizer.columns_renaming
|
|
2465
|
+
self.fit_columns_renaming = columns_renaming
|
|
2338
2466
|
|
|
2339
2467
|
df = remove_fintech_duplicates(
|
|
2340
2468
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2342,38 +2470,58 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2342
2470
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2343
2471
|
|
|
2344
2472
|
# Explode multiple search keys
|
|
2345
|
-
|
|
2346
|
-
meaning_types = {
|
|
2347
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
-
}
|
|
2350
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2473
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2354
2474
|
|
|
2355
2475
|
# TODO check that this is correct for enrichment
|
|
2356
2476
|
self.df_with_original_index = df.copy()
|
|
2477
|
+
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2357
2478
|
|
|
2358
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2479
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
|
|
2359
2480
|
|
|
2360
2481
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2361
2482
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2362
2483
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
-
email_converted_to_hem = False
|
|
2364
2484
|
if email_column:
|
|
2365
2485
|
converter = EmailSearchKeyConverter(
|
|
2366
|
-
email_column,
|
|
2486
|
+
email_column,
|
|
2487
|
+
hem_column,
|
|
2488
|
+
self.fit_search_keys,
|
|
2489
|
+
columns_renaming,
|
|
2490
|
+
list(unnest_search_keys.keys()),
|
|
2491
|
+
self.logger,
|
|
2367
2492
|
)
|
|
2368
2493
|
df = converter.convert(df)
|
|
2369
|
-
|
|
2370
|
-
|
|
2494
|
+
|
|
2495
|
+
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2496
|
+
if ip_column:
|
|
2497
|
+
converter = IpSearchKeyConverter(
|
|
2498
|
+
ip_column,
|
|
2499
|
+
self.fit_search_keys,
|
|
2500
|
+
columns_renaming,
|
|
2501
|
+
list(unnest_search_keys.keys()),
|
|
2502
|
+
self.bundle,
|
|
2503
|
+
self.logger,
|
|
2504
|
+
)
|
|
2505
|
+
df = converter.convert(df)
|
|
2506
|
+
|
|
2507
|
+
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2508
|
+
country_column = self._get_country_column(self.fit_search_keys)
|
|
2509
|
+
if phone_column:
|
|
2510
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2511
|
+
df = converter.convert(df)
|
|
2512
|
+
|
|
2513
|
+
if country_column:
|
|
2514
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2515
|
+
df = converter.convert(df)
|
|
2516
|
+
|
|
2517
|
+
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2518
|
+
if postal_code:
|
|
2519
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2520
|
+
df = converter.convert(df)
|
|
2371
2521
|
|
|
2372
2522
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2373
2523
|
self.fit_search_keys.keys()
|
|
2374
2524
|
)
|
|
2375
|
-
if email_converted_to_hem:
|
|
2376
|
-
non_feature_columns.append(email_column)
|
|
2377
2525
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2378
2526
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2379
2527
|
|
|
@@ -2385,9 +2533,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2385
2533
|
self.fit_dropped_features.update(features_to_drop)
|
|
2386
2534
|
df = df.drop(columns=features_to_drop)
|
|
2387
2535
|
|
|
2388
|
-
if email_converted_to_hem:
|
|
2389
|
-
self.fit_dropped_features.add(email_column)
|
|
2390
|
-
|
|
2391
2536
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2392
2537
|
|
|
2393
2538
|
meaning_types = {
|
|
@@ -2401,7 +2546,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2401
2546
|
if eval_set is not None and len(eval_set) > 0:
|
|
2402
2547
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2403
2548
|
|
|
2404
|
-
df = self.__add_fit_system_record_id(df,
|
|
2549
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2550
|
+
|
|
2551
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2552
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2553
|
+
|
|
2554
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2405
2555
|
|
|
2406
2556
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2407
2557
|
|
|
@@ -2419,8 +2569,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2419
2569
|
rest_client=self.rest_client,
|
|
2420
2570
|
logger=self.logger,
|
|
2421
2571
|
)
|
|
2422
|
-
|
|
2423
|
-
dataset.ignore_columns = [email_column]
|
|
2572
|
+
dataset.columns_renaming = columns_renaming
|
|
2424
2573
|
|
|
2425
2574
|
self.passed_features = [
|
|
2426
2575
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2809,7 +2958,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2809
2958
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2810
2959
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2811
2960
|
else:
|
|
2812
|
-
date_column =
|
|
2961
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2813
2962
|
sort_columns = [date_column] if date_column is not None else []
|
|
2814
2963
|
|
|
2815
2964
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -2905,10 +3054,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2905
3054
|
|
|
2906
3055
|
do_without_pandas_limits(print_datasets_sample)
|
|
2907
3056
|
|
|
2908
|
-
maybe_date_col =
|
|
3057
|
+
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2909
3058
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2910
3059
|
# TODO cast date column to single dtype
|
|
2911
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3060
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
|
|
2912
3061
|
converted_X = date_converter.convert(X)
|
|
2913
3062
|
min_date = converted_X[maybe_date_col].min()
|
|
2914
3063
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -2935,12 +3084,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2935
3084
|
|
|
2936
3085
|
return df
|
|
2937
3086
|
|
|
2938
|
-
@staticmethod
|
|
2939
|
-
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
-
for col, t in search_keys.items():
|
|
2941
|
-
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
-
return col
|
|
2943
|
-
|
|
2944
3087
|
@staticmethod
|
|
2945
3088
|
def _add_current_date_as_key(
|
|
2946
3089
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -2956,7 +3099,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2956
3099
|
logger.warning(msg)
|
|
2957
3100
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2958
3101
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2959
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
3102
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
|
|
2960
3103
|
df = converter.convert(df)
|
|
2961
3104
|
return df
|
|
2962
3105
|
|
|
@@ -2984,17 +3127,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2984
3127
|
if len(cols) == 1:
|
|
2985
3128
|
return cols[0]
|
|
2986
3129
|
|
|
3130
|
+
@staticmethod
|
|
3131
|
+
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3132
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3133
|
+
if len(cols) > 1:
|
|
3134
|
+
raise Exception("More than one ip column found after unnest")
|
|
3135
|
+
if len(cols) == 1:
|
|
3136
|
+
return cols[0]
|
|
3137
|
+
|
|
2987
3138
|
@staticmethod
|
|
2988
3139
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2989
3140
|
for col, t in search_keys.items():
|
|
2990
3141
|
if t == SearchKey.PHONE:
|
|
2991
3142
|
return col
|
|
2992
3143
|
|
|
3144
|
+
@staticmethod
|
|
3145
|
+
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3146
|
+
for col, t in search_keys.items():
|
|
3147
|
+
if t == SearchKey.COUNTRY:
|
|
3148
|
+
return col
|
|
3149
|
+
|
|
3150
|
+
@staticmethod
|
|
3151
|
+
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3152
|
+
for col, t in search_keys.items():
|
|
3153
|
+
if t == SearchKey.POSTAL_CODE:
|
|
3154
|
+
return col
|
|
3155
|
+
|
|
2993
3156
|
def _explode_multiple_search_keys(
|
|
2994
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3157
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
2995
3158
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2996
3159
|
# find groups of multiple search keys
|
|
2997
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3160
|
+
search_key_names_by_type: Dict[SearchKey, List[str]] = {}
|
|
2998
3161
|
for key_name, key_type in search_keys.items():
|
|
2999
3162
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3000
3163
|
search_key_names_by_type = {
|
|
@@ -3003,6 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3003
3166
|
if len(search_key_names_by_type) == 0:
|
|
3004
3167
|
return df, {}
|
|
3005
3168
|
|
|
3169
|
+
self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
|
|
3006
3170
|
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3007
3171
|
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3008
3172
|
exploded_dfs = []
|
|
@@ -3018,14 +3182,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3018
3182
|
del search_keys[old_key]
|
|
3019
3183
|
search_keys[new_search_key] = key_type
|
|
3020
3184
|
unnest_search_keys[new_search_key] = key_names
|
|
3185
|
+
columns_renaming[new_search_key] = new_search_key
|
|
3021
3186
|
|
|
3022
3187
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3188
|
+
self.logger.info(f"Finished explosion. Size after: {len(df)}")
|
|
3023
3189
|
return df, unnest_search_keys
|
|
3024
3190
|
|
|
3025
3191
|
def __add_fit_system_record_id(
|
|
3026
3192
|
self,
|
|
3027
3193
|
df: pd.DataFrame,
|
|
3028
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3194
|
+
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3029
3195
|
search_keys: Dict[str, SearchKey],
|
|
3030
3196
|
id_name: str,
|
|
3031
3197
|
) -> pd.DataFrame:
|
|
@@ -3048,39 +3214,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3048
3214
|
]
|
|
3049
3215
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3050
3216
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3051
|
-
sort_exclude_columns.append(
|
|
3217
|
+
sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
|
|
3052
3218
|
else:
|
|
3053
|
-
date_column =
|
|
3219
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3054
3220
|
sort_columns = [date_column] if date_column is not None else []
|
|
3055
3221
|
|
|
3222
|
+
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
3223
|
+
sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
|
|
3224
|
+
|
|
3056
3225
|
other_columns = sorted(
|
|
3057
3226
|
[
|
|
3058
3227
|
c
|
|
3059
3228
|
for c in df.columns
|
|
3060
|
-
if c not in sort_columns
|
|
3229
|
+
if c not in sort_columns
|
|
3230
|
+
and c not in sorted_other_keys
|
|
3231
|
+
and c not in sort_exclude_columns
|
|
3232
|
+
and df[c].nunique() > 1
|
|
3061
3233
|
]
|
|
3062
|
-
# [
|
|
3063
|
-
# sk
|
|
3064
|
-
# for sk, key_type in search_keys.items()
|
|
3065
|
-
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
-
# and sk in df.columns
|
|
3067
|
-
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
-
# ]
|
|
3069
3234
|
)
|
|
3070
3235
|
|
|
3236
|
+
all_other_columns = sorted_other_keys + other_columns
|
|
3237
|
+
|
|
3071
3238
|
search_keys_hash = "search_keys_hash"
|
|
3072
|
-
if len(
|
|
3239
|
+
if len(all_other_columns) > 0:
|
|
3073
3240
|
sort_columns.append(search_keys_hash)
|
|
3074
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
3241
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
|
|
3075
3242
|
|
|
3076
3243
|
df = df.sort_values(by=sort_columns)
|
|
3077
3244
|
|
|
3078
3245
|
if search_keys_hash in df.columns:
|
|
3079
3246
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3080
3247
|
|
|
3081
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
-
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
-
|
|
3084
3248
|
df = df.reset_index(drop=True).reset_index()
|
|
3085
3249
|
# system_record_id saves correct order for fit
|
|
3086
3250
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3090,11 +3254,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3090
3254
|
df.index.name = original_index_name
|
|
3091
3255
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3092
3256
|
|
|
3093
|
-
meaning_types[id_name] = (
|
|
3094
|
-
|
|
3095
|
-
|
|
3096
|
-
|
|
3097
|
-
)
|
|
3257
|
+
# meaning_types[id_name] = (
|
|
3258
|
+
# FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3259
|
+
# if id_name == SYSTEM_RECORD_ID
|
|
3260
|
+
# else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3261
|
+
# )
|
|
3098
3262
|
return df
|
|
3099
3263
|
|
|
3100
3264
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3472,10 +3636,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3472
3636
|
for _, key_type in search_keys.items():
|
|
3473
3637
|
if not isinstance(key_type, SearchKey):
|
|
3474
3638
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3639
|
+
|
|
3475
3640
|
valid_search_keys = {}
|
|
3476
3641
|
unsupported_search_keys = {
|
|
3477
3642
|
SearchKey.IP_RANGE_FROM,
|
|
3478
3643
|
SearchKey.IP_RANGE_TO,
|
|
3644
|
+
SearchKey.IPV6_RANGE_FROM,
|
|
3645
|
+
SearchKey.IPV6_RANGE_TO,
|
|
3479
3646
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3480
3647
|
SearchKey.MSISDN_RANGE_TO,
|
|
3481
3648
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3565,6 +3732,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3565
3732
|
print(msg)
|
|
3566
3733
|
self.logger.warning(msg)
|
|
3567
3734
|
self.warning_counter.increment()
|
|
3735
|
+
# TODO maybe raise ValidationError
|
|
3568
3736
|
|
|
3569
3737
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3570
3738
|
|