upgini 1.1.315a3579.dev1__py3-none-any.whl → 1.1.316a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +71 -71
- upgini/autofe/date.py +21 -21
- upgini/autofe/feature.py +2 -2
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +4 -4
- upgini/autofe/unary.py +47 -46
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +9 -0
- upgini/dataset.py +34 -387
- upgini/features_enricher.py +338 -169
- upgini/http.py +20 -31
- upgini/lazy_import.py +14 -1
- upgini/metadata.py +72 -57
- upgini/normalizer/normalize_utils.py +202 -0
- upgini/utils/country_utils.py +16 -0
- upgini/utils/datetime_utils.py +41 -20
- upgini/utils/email_utils.py +49 -17
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/target_utils.py +4 -1
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/METADATA +3 -3
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/RECORD +26 -26
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/WHEEL +1 -1
- upgini/normalizer/phone_normalizer.py +0 -340
- {upgini-1.1.315a3579.dev1.dist-info → upgini-1.1.316a1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -23,7 +23,6 @@ from pandas.api.types import (
|
|
|
23
23
|
is_datetime64_any_dtype,
|
|
24
24
|
is_numeric_dtype,
|
|
25
25
|
is_object_dtype,
|
|
26
|
-
is_period_dtype,
|
|
27
26
|
is_string_dtype,
|
|
28
27
|
)
|
|
29
28
|
from scipy.stats import ks_2samp
|
|
@@ -61,11 +60,15 @@ from upgini.metadata import (
|
|
|
61
60
|
SearchKey,
|
|
62
61
|
)
|
|
63
62
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
63
|
+
from upgini.normalizer.normalize_utils import Normalizer
|
|
64
64
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
65
65
|
from upgini.search_task import SearchTask
|
|
66
66
|
from upgini.spinner import Spinner
|
|
67
67
|
from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
|
|
68
|
-
from upgini.utils.country_utils import
|
|
68
|
+
from upgini.utils.country_utils import (
|
|
69
|
+
CountrySearchKeyConverter,
|
|
70
|
+
CountrySearchKeyDetector,
|
|
71
|
+
)
|
|
69
72
|
from upgini.utils.custom_loss_utils import (
|
|
70
73
|
get_additional_params_custom_loss,
|
|
71
74
|
get_runtime_params_custom_loss,
|
|
@@ -87,11 +90,19 @@ from upgini.utils.display_utils import (
|
|
|
87
90
|
prepare_and_show_report,
|
|
88
91
|
show_request_quote_button,
|
|
89
92
|
)
|
|
90
|
-
from upgini.utils.email_utils import
|
|
93
|
+
from upgini.utils.email_utils import (
|
|
94
|
+
EmailDomainGenerator,
|
|
95
|
+
EmailSearchKeyConverter,
|
|
96
|
+
EmailSearchKeyDetector,
|
|
97
|
+
)
|
|
91
98
|
from upgini.utils.features_validator import FeaturesValidator
|
|
92
99
|
from upgini.utils.format import Format
|
|
93
|
-
from upgini.utils.
|
|
94
|
-
from upgini.utils.
|
|
100
|
+
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
101
|
+
from upgini.utils.phone_utils import PhoneSearchKeyConverter, PhoneSearchKeyDetector
|
|
102
|
+
from upgini.utils.postal_code_utils import (
|
|
103
|
+
PostalCodeSearchKeyConverter,
|
|
104
|
+
PostalCodeSearchKeyDetector,
|
|
105
|
+
)
|
|
95
106
|
|
|
96
107
|
try:
|
|
97
108
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
|
@@ -237,6 +248,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
237
248
|
|
|
238
249
|
self.passed_features: List[str] = []
|
|
239
250
|
self.df_with_original_index: Optional[pd.DataFrame] = None
|
|
251
|
+
self.fit_columns_renaming: Optional[Dict[str, str]] = None
|
|
240
252
|
self.country_added = False
|
|
241
253
|
self.fit_generated_features: List[str] = []
|
|
242
254
|
self.fit_dropped_features: Set[str] = set()
|
|
@@ -247,7 +259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
247
259
|
self.eval_set: Optional[List[Tuple]] = None
|
|
248
260
|
self.autodetected_search_keys: Dict[str, SearchKey] = {}
|
|
249
261
|
self.imbalanced = False
|
|
250
|
-
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
262
|
+
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = None
|
|
251
263
|
|
|
252
264
|
validate_version(self.logger)
|
|
253
265
|
self.search_keys = search_keys or {}
|
|
@@ -706,7 +718,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
706
718
|
|
|
707
719
|
start_time = time.time()
|
|
708
720
|
try:
|
|
709
|
-
result = self.__inner_transform(
|
|
721
|
+
result, _ = self.__inner_transform(
|
|
710
722
|
trace_id,
|
|
711
723
|
X,
|
|
712
724
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -833,17 +845,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
833
845
|
self.logger.warning(msg)
|
|
834
846
|
print(msg)
|
|
835
847
|
|
|
848
|
+
if X is not None and y is None:
|
|
849
|
+
raise ValidationError("X passed without y")
|
|
850
|
+
|
|
836
851
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
837
852
|
effective_X = X if X is not None else self.X
|
|
838
853
|
effective_y = y if y is not None else self.y
|
|
839
854
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
840
855
|
effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
|
|
841
856
|
|
|
857
|
+
if (
|
|
858
|
+
self._search_task is None
|
|
859
|
+
or self._search_task.provider_metadata_v2 is None
|
|
860
|
+
or len(self._search_task.provider_metadata_v2) == 0
|
|
861
|
+
or effective_X is None
|
|
862
|
+
or effective_y is None
|
|
863
|
+
):
|
|
864
|
+
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
865
|
+
|
|
866
|
+
validated_X = self._validate_X(effective_X)
|
|
867
|
+
validated_y = self._validate_y(validated_X, effective_y)
|
|
868
|
+
validated_eval_set = (
|
|
869
|
+
[self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
|
|
870
|
+
if effective_eval_set is not None
|
|
871
|
+
else None
|
|
872
|
+
)
|
|
873
|
+
|
|
842
874
|
try:
|
|
843
875
|
self.__log_debug_information(
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
876
|
+
validated_X,
|
|
877
|
+
validated_y,
|
|
878
|
+
validated_eval_set,
|
|
847
879
|
exclude_features_sources=exclude_features_sources,
|
|
848
880
|
cv=cv if cv is not None else self.cv,
|
|
849
881
|
importance_threshold=importance_threshold,
|
|
@@ -853,21 +885,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
853
885
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
|
854
886
|
)
|
|
855
887
|
|
|
856
|
-
if (
|
|
857
|
-
self._search_task is None
|
|
858
|
-
or self._search_task.provider_metadata_v2 is None
|
|
859
|
-
or len(self._search_task.provider_metadata_v2) == 0
|
|
860
|
-
or effective_X is None
|
|
861
|
-
or effective_y is None
|
|
862
|
-
):
|
|
863
|
-
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
864
|
-
|
|
865
|
-
if X is not None and y is None:
|
|
866
|
-
raise ValidationError("X passed without y")
|
|
867
|
-
|
|
868
888
|
validate_scoring_argument(scoring)
|
|
869
889
|
|
|
870
|
-
self._validate_baseline_score(
|
|
890
|
+
self._validate_baseline_score(validated_X, validated_eval_set)
|
|
871
891
|
|
|
872
892
|
if self._has_paid_features(exclude_features_sources):
|
|
873
893
|
msg = self.bundle.get("metrics_with_paid_features")
|
|
@@ -876,7 +896,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
876
896
|
return None
|
|
877
897
|
|
|
878
898
|
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
879
|
-
estimator,
|
|
899
|
+
estimator, validated_X, self.search_keys
|
|
880
900
|
)
|
|
881
901
|
|
|
882
902
|
prepared_data = self._prepare_data_for_metrics(
|
|
@@ -906,8 +926,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
906
926
|
search_keys,
|
|
907
927
|
groups,
|
|
908
928
|
_cv,
|
|
929
|
+
columns_renaming,
|
|
909
930
|
) = prepared_data
|
|
910
931
|
|
|
932
|
+
# rename cat_features
|
|
933
|
+
if cat_features:
|
|
934
|
+
for new_c, old_c in columns_renaming.items():
|
|
935
|
+
if old_c in cat_features:
|
|
936
|
+
cat_features.remove(old_c)
|
|
937
|
+
cat_features.append(new_c)
|
|
938
|
+
|
|
911
939
|
gc.collect()
|
|
912
940
|
|
|
913
941
|
print(self.bundle.get("metrics_start"))
|
|
@@ -920,7 +948,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
920
948
|
|
|
921
949
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
922
950
|
|
|
923
|
-
has_date =
|
|
951
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
924
952
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
|
925
953
|
|
|
926
954
|
wrapper = EstimatorWrapper.create(
|
|
@@ -1013,10 +1041,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1013
1041
|
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1014
1042
|
}
|
|
1015
1043
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1016
|
-
|
|
1044
|
+
validated_y
|
|
1017
1045
|
):
|
|
1018
1046
|
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1019
|
-
np.mean(
|
|
1047
|
+
np.mean(validated_y), 4
|
|
1020
1048
|
)
|
|
1021
1049
|
if etalon_metric is not None:
|
|
1022
1050
|
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
@@ -1086,10 +1114,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1086
1114
|
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1087
1115
|
}
|
|
1088
1116
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1089
|
-
|
|
1117
|
+
validated_eval_set[idx][1]
|
|
1090
1118
|
):
|
|
1091
1119
|
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1092
|
-
np.mean(
|
|
1120
|
+
np.mean(validated_eval_set[idx][1]), 4
|
|
1093
1121
|
)
|
|
1094
1122
|
if etalon_eval_metric is not None:
|
|
1095
1123
|
eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
|
|
@@ -1113,7 +1141,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1113
1141
|
)
|
|
1114
1142
|
|
|
1115
1143
|
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1116
|
-
date_column =
|
|
1144
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1117
1145
|
if (
|
|
1118
1146
|
uplift_col in metrics_df.columns
|
|
1119
1147
|
and (metrics_df[uplift_col] < 0).any()
|
|
@@ -1195,27 +1223,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1195
1223
|
def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
|
|
1196
1224
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
|
1197
1225
|
|
|
1198
|
-
def _extend_x(self, x: pd.DataFrame, is_demo_dataset: bool) -> Tuple[pd.DataFrame, Dict[str, SearchKey]]:
|
|
1199
|
-
search_keys = self.search_keys.copy()
|
|
1200
|
-
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1201
|
-
|
|
1202
|
-
extended_X = x.copy()
|
|
1203
|
-
generated_features = []
|
|
1204
|
-
date_column = self._get_date_column(search_keys)
|
|
1205
|
-
if date_column is not None:
|
|
1206
|
-
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1207
|
-
extended_X = converter.convert(extended_X, keep_time=True)
|
|
1208
|
-
generated_features.extend(converter.generated_features)
|
|
1209
|
-
email_column = self._get_email_column(search_keys)
|
|
1210
|
-
hem_column = self._get_hem_column(search_keys)
|
|
1211
|
-
if email_column:
|
|
1212
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1213
|
-
extended_X = converter.convert(extended_X)
|
|
1214
|
-
generated_features.extend(converter.generated_features)
|
|
1215
|
-
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1216
|
-
|
|
1217
|
-
return extended_X, search_keys
|
|
1218
|
-
|
|
1219
1226
|
def _is_input_same_as_fit(
|
|
1220
1227
|
self,
|
|
1221
1228
|
X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
|
|
@@ -1259,7 +1266,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1259
1266
|
groups = None
|
|
1260
1267
|
|
|
1261
1268
|
if not isinstance(_cv, BaseCrossValidator):
|
|
1262
|
-
date_column =
|
|
1269
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1263
1270
|
date_series = X[date_column] if date_column is not None else None
|
|
1264
1271
|
_cv, groups = CVConfig(
|
|
1265
1272
|
_cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
|
|
@@ -1282,7 +1289,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1282
1289
|
|
|
1283
1290
|
def _get_client_cat_features(
|
|
1284
1291
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1285
|
-
) -> Optional[List[str]]:
|
|
1292
|
+
) -> Tuple[Optional[List[str]], List[str]]:
|
|
1286
1293
|
cat_features = None
|
|
1287
1294
|
search_keys_for_metrics = []
|
|
1288
1295
|
if (
|
|
@@ -1342,11 +1349,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1342
1349
|
progress_bar,
|
|
1343
1350
|
progress_callback,
|
|
1344
1351
|
)
|
|
1345
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = dataclasses.astuple(
|
|
1352
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
|
|
1353
|
+
sampled_data
|
|
1354
|
+
)
|
|
1346
1355
|
|
|
1347
1356
|
excluding_search_keys = list(search_keys.keys())
|
|
1348
1357
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1349
|
-
|
|
1358
|
+
for sk in excluding_search_keys:
|
|
1359
|
+
if columns_renaming.get(sk) in search_keys_for_metrics:
|
|
1360
|
+
excluding_search_keys.remove(sk)
|
|
1350
1361
|
|
|
1351
1362
|
client_features = [
|
|
1352
1363
|
c
|
|
@@ -1363,6 +1374,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1363
1374
|
importance_threshold,
|
|
1364
1375
|
max_features,
|
|
1365
1376
|
)
|
|
1377
|
+
filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
|
|
1366
1378
|
|
|
1367
1379
|
X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
|
|
1368
1380
|
enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
|
|
@@ -1392,9 +1404,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1392
1404
|
fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
|
|
1393
1405
|
fitting_enriched_X = fitting_enriched_X.drop(columns=constant_columns, errors="ignore")
|
|
1394
1406
|
|
|
1407
|
+
# TODO maybe there is no more need for these conversions
|
|
1395
1408
|
# Remove datetime features
|
|
1396
1409
|
datetime_features = [
|
|
1397
|
-
f
|
|
1410
|
+
f
|
|
1411
|
+
for f in fitting_X.columns
|
|
1412
|
+
if is_datetime64_any_dtype(fitting_X[f]) or isinstance(fitting_X[f].dtype, pd.PeriodDtype)
|
|
1398
1413
|
]
|
|
1399
1414
|
if len(datetime_features) > 0:
|
|
1400
1415
|
self.logger.warning(self.bundle.get("dataset_date_features").format(datetime_features))
|
|
@@ -1479,6 +1494,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1479
1494
|
search_keys,
|
|
1480
1495
|
groups,
|
|
1481
1496
|
cv,
|
|
1497
|
+
columns_renaming,
|
|
1482
1498
|
)
|
|
1483
1499
|
|
|
1484
1500
|
@dataclass
|
|
@@ -1488,6 +1504,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1488
1504
|
enriched_X: pd.DataFrame
|
|
1489
1505
|
eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
|
|
1490
1506
|
search_keys: Dict[str, SearchKey]
|
|
1507
|
+
columns_renaming: Dict[str, str]
|
|
1491
1508
|
|
|
1492
1509
|
def _sample_data_for_metrics(
|
|
1493
1510
|
self,
|
|
@@ -1527,11 +1544,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1527
1544
|
)
|
|
1528
1545
|
|
|
1529
1546
|
def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
|
|
1530
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys =
|
|
1547
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
|
1548
|
+
self.__cached_sampled_datasets
|
|
1549
|
+
)
|
|
1531
1550
|
if exclude_features_sources:
|
|
1532
1551
|
enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")
|
|
1533
1552
|
|
|
1534
|
-
return self.__mk_sampled_data_tuple(
|
|
1553
|
+
return self.__mk_sampled_data_tuple(
|
|
1554
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1555
|
+
)
|
|
1535
1556
|
|
|
1536
1557
|
def __sample_only_input(
|
|
1537
1558
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
@@ -1549,6 +1570,28 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1549
1570
|
eval_xy[EVAL_SET_INDEX] = idx + 1
|
|
1550
1571
|
df = pd.concat([df, eval_xy])
|
|
1551
1572
|
|
|
1573
|
+
search_keys = self.search_keys.copy()
|
|
1574
|
+
search_keys = self.__prepare_search_keys(df, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1575
|
+
|
|
1576
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1577
|
+
generated_features = []
|
|
1578
|
+
if date_column is not None:
|
|
1579
|
+
converter = DateTimeSearchKeyConverter(
|
|
1580
|
+
date_column, self.date_format, self.logger, self.bundle, silent_mode=True
|
|
1581
|
+
)
|
|
1582
|
+
df = converter.convert(df, keep_time=True)
|
|
1583
|
+
generated_features = converter.generated_features
|
|
1584
|
+
|
|
1585
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
1586
|
+
if email_columns:
|
|
1587
|
+
generator = EmailDomainGenerator(email_columns)
|
|
1588
|
+
df = generator.generate(df)
|
|
1589
|
+
generated_features.extend(generator.generated_features)
|
|
1590
|
+
|
|
1591
|
+
normalizer = Normalizer(self.search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1592
|
+
df = normalizer.normalize(df)
|
|
1593
|
+
columns_renaming = normalizer.columns_renaming
|
|
1594
|
+
|
|
1552
1595
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1553
1596
|
|
|
1554
1597
|
num_samples = _num_samples(df)
|
|
@@ -1561,24 +1604,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1561
1604
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1562
1605
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1563
1606
|
|
|
1564
|
-
|
|
1565
|
-
|
|
1607
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1608
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1609
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1566
1610
|
|
|
1567
|
-
train_df =
|
|
1611
|
+
train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
|
|
1568
1612
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1569
1613
|
y_sampled = train_df[TARGET].copy()
|
|
1570
1614
|
enriched_X = X_sampled
|
|
1571
1615
|
|
|
1572
1616
|
if eval_set is not None:
|
|
1573
1617
|
for idx in range(len(eval_set)):
|
|
1574
|
-
eval_xy_sampled =
|
|
1618
|
+
eval_xy_sampled = df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1575
1619
|
eval_X_sampled = eval_xy_sampled.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1576
1620
|
eval_y_sampled = eval_xy_sampled[TARGET].copy()
|
|
1577
1621
|
enriched_eval_X = eval_X_sampled
|
|
1578
1622
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1579
|
-
self.__cached_sampled_datasets = (
|
|
1623
|
+
self.__cached_sampled_datasets = (
|
|
1624
|
+
X_sampled,
|
|
1625
|
+
y_sampled,
|
|
1626
|
+
enriched_X,
|
|
1627
|
+
eval_set_sampled_dict,
|
|
1628
|
+
search_keys,
|
|
1629
|
+
columns_renaming,
|
|
1630
|
+
)
|
|
1580
1631
|
|
|
1581
|
-
return self.__mk_sampled_data_tuple(
|
|
1632
|
+
return self.__mk_sampled_data_tuple(
|
|
1633
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
|
1634
|
+
)
|
|
1582
1635
|
|
|
1583
1636
|
def __sample_balanced(
|
|
1584
1637
|
self,
|
|
@@ -1590,7 +1643,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1590
1643
|
search_keys = self.fit_search_keys
|
|
1591
1644
|
|
|
1592
1645
|
rows_to_drop = None
|
|
1593
|
-
has_date =
|
|
1646
|
+
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1594
1647
|
task_type = self.model_task_type or define_task(
|
|
1595
1648
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1596
1649
|
)
|
|
@@ -1644,9 +1697,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1644
1697
|
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1645
1698
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1646
1699
|
|
|
1647
|
-
self.__cached_sampled_datasets = (
|
|
1700
|
+
self.__cached_sampled_datasets = (
|
|
1701
|
+
X_sampled,
|
|
1702
|
+
y_sampled,
|
|
1703
|
+
enriched_X,
|
|
1704
|
+
eval_set_sampled_dict,
|
|
1705
|
+
search_keys,
|
|
1706
|
+
self.fit_columns_renaming,
|
|
1707
|
+
)
|
|
1648
1708
|
|
|
1649
|
-
return self.__mk_sampled_data_tuple(
|
|
1709
|
+
return self.__mk_sampled_data_tuple(
|
|
1710
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
|
|
1711
|
+
)
|
|
1650
1712
|
|
|
1651
1713
|
def __sample_imbalanced(
|
|
1652
1714
|
self,
|
|
@@ -1686,7 +1748,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1686
1748
|
tmp_target_name = "__target"
|
|
1687
1749
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1688
1750
|
|
|
1689
|
-
enriched_df = self.__inner_transform(
|
|
1751
|
+
enriched_df, columns_renaming = self.__inner_transform(
|
|
1690
1752
|
trace_id,
|
|
1691
1753
|
df,
|
|
1692
1754
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1734,7 +1796,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1734
1796
|
tmp_target_name = "__target"
|
|
1735
1797
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1736
1798
|
|
|
1737
|
-
enriched_Xy = self.__inner_transform(
|
|
1799
|
+
enriched_Xy, columns_renaming = self.__inner_transform(
|
|
1738
1800
|
trace_id,
|
|
1739
1801
|
df,
|
|
1740
1802
|
exclude_features_sources=exclude_features_sources,
|
|
@@ -1759,9 +1821,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1759
1821
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1760
1822
|
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1761
1823
|
|
|
1762
|
-
self.__cached_sampled_datasets = (
|
|
1824
|
+
self.__cached_sampled_datasets = (
|
|
1825
|
+
X_sampled,
|
|
1826
|
+
y_sampled,
|
|
1827
|
+
enriched_X,
|
|
1828
|
+
eval_set_sampled_dict,
|
|
1829
|
+
self.search_keys,
|
|
1830
|
+
columns_renaming,
|
|
1831
|
+
)
|
|
1763
1832
|
|
|
1764
|
-
return self.__mk_sampled_data_tuple(
|
|
1833
|
+
return self.__mk_sampled_data_tuple(
|
|
1834
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
|
|
1835
|
+
)
|
|
1765
1836
|
|
|
1766
1837
|
def __mk_sampled_data_tuple(
|
|
1767
1838
|
self,
|
|
@@ -1770,6 +1841,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1770
1841
|
enriched_X: pd.DataFrame,
|
|
1771
1842
|
eval_set_sampled_dict: Dict,
|
|
1772
1843
|
search_keys: Dict,
|
|
1844
|
+
columns_renaming: Dict[str, str],
|
|
1773
1845
|
):
|
|
1774
1846
|
search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
|
|
1775
1847
|
return FeaturesEnricher._SampledDataForMetrics(
|
|
@@ -1778,6 +1850,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1778
1850
|
enriched_X=enriched_X,
|
|
1779
1851
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
|
1780
1852
|
search_keys=search_keys,
|
|
1853
|
+
columns_renaming=columns_renaming,
|
|
1781
1854
|
)
|
|
1782
1855
|
|
|
1783
1856
|
def get_search_id(self) -> Optional[str]:
|
|
@@ -1866,7 +1939,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1866
1939
|
progress_bar: Optional[ProgressBar] = None,
|
|
1867
1940
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1868
1941
|
add_fit_system_record_id: bool = False,
|
|
1869
|
-
) -> pd.DataFrame:
|
|
1942
|
+
) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
|
1870
1943
|
if self._search_task is None:
|
|
1871
1944
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1872
1945
|
|
|
@@ -1879,13 +1952,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1879
1952
|
|
|
1880
1953
|
if len(self.feature_names_) == 0:
|
|
1881
1954
|
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
1882
|
-
return X
|
|
1955
|
+
return X, {c: c for c in X.columns}
|
|
1883
1956
|
|
|
1884
1957
|
if self._has_paid_features(exclude_features_sources):
|
|
1885
1958
|
msg = self.bundle.get("transform_with_paid_features")
|
|
1886
1959
|
self.logger.warning(msg)
|
|
1887
1960
|
self.__display_support_link(msg)
|
|
1888
|
-
return None
|
|
1961
|
+
return None, {c: c for c in X.columns}
|
|
1889
1962
|
|
|
1890
1963
|
if not metrics_calculation:
|
|
1891
1964
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -1896,7 +1969,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1896
1969
|
self.logger.warning(msg)
|
|
1897
1970
|
print(msg)
|
|
1898
1971
|
show_request_quote_button()
|
|
1899
|
-
return None
|
|
1972
|
+
return None, {c: c for c in X.columns}
|
|
1900
1973
|
else:
|
|
1901
1974
|
msg = self.bundle.get("transform_usage_info").format(
|
|
1902
1975
|
transform_usage.limit, transform_usage.transformed_rows
|
|
@@ -1934,9 +2007,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1934
2007
|
df = self.__add_country_code(df, search_keys)
|
|
1935
2008
|
|
|
1936
2009
|
generated_features = []
|
|
1937
|
-
date_column =
|
|
2010
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
1938
2011
|
if date_column is not None:
|
|
1939
|
-
converter = DateTimeSearchKeyConverter(
|
|
2012
|
+
converter = DateTimeSearchKeyConverter(
|
|
2013
|
+
date_column, self.date_format, self.logger, bundle=self.bundle, silent_mode=silent_mode
|
|
2014
|
+
)
|
|
1940
2015
|
df = converter.convert(df)
|
|
1941
2016
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
1942
2017
|
generated_features.extend(converter.generated_features)
|
|
@@ -1945,61 +2020,93 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1945
2020
|
if self.add_date_if_missing:
|
|
1946
2021
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1947
2022
|
|
|
2023
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2024
|
+
if email_columns:
|
|
2025
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2026
|
+
df = generator.generate(df)
|
|
2027
|
+
generated_features.extend(generator.generated_features)
|
|
2028
|
+
|
|
2029
|
+
normalizer = Normalizer(
|
|
2030
|
+
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2031
|
+
)
|
|
2032
|
+
df = normalizer.normalize(df)
|
|
2033
|
+
columns_renaming = normalizer.columns_renaming
|
|
2034
|
+
|
|
1948
2035
|
# Don't pass all features in backend on transform
|
|
1949
|
-
original_features_for_transform = []
|
|
1950
2036
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1951
|
-
|
|
1952
|
-
if len(
|
|
1953
|
-
|
|
1954
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1955
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1956
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1957
|
-
original_features_for_transform = [
|
|
1958
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1959
|
-
]
|
|
2037
|
+
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2038
|
+
if len(features_for_transform) > 0:
|
|
2039
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1960
2040
|
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2041
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
1964
2042
|
|
|
1965
2043
|
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1966
2044
|
df[columns_for_system_record_id], index=False
|
|
1967
2045
|
).astype("Float64")
|
|
1968
2046
|
|
|
1969
2047
|
# Explode multiple search keys
|
|
1970
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
2048
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
|
1971
2049
|
|
|
1972
2050
|
email_column = self._get_email_column(search_keys)
|
|
1973
2051
|
hem_column = self._get_hem_column(search_keys)
|
|
1974
|
-
email_converted_to_hem = False
|
|
1975
2052
|
if email_column:
|
|
1976
2053
|
converter = EmailSearchKeyConverter(
|
|
1977
|
-
email_column,
|
|
2054
|
+
email_column,
|
|
2055
|
+
hem_column,
|
|
2056
|
+
search_keys,
|
|
2057
|
+
columns_renaming,
|
|
2058
|
+
list(unnest_search_keys.keys()),
|
|
2059
|
+
self.logger,
|
|
1978
2060
|
)
|
|
1979
2061
|
df = converter.convert(df)
|
|
1980
|
-
|
|
1981
|
-
|
|
2062
|
+
|
|
2063
|
+
ip_column = self._get_ip_column(search_keys)
|
|
2064
|
+
if ip_column:
|
|
2065
|
+
converter = IpSearchKeyConverter(
|
|
2066
|
+
ip_column,
|
|
2067
|
+
search_keys,
|
|
2068
|
+
columns_renaming,
|
|
2069
|
+
list(unnest_search_keys.keys()),
|
|
2070
|
+
self.bundle,
|
|
2071
|
+
self.logger,
|
|
2072
|
+
)
|
|
2073
|
+
df = converter.convert(df)
|
|
2074
|
+
|
|
2075
|
+
phone_column = self._get_phone_column(search_keys)
|
|
2076
|
+
country_column = self._get_country_column(search_keys)
|
|
2077
|
+
if phone_column:
|
|
2078
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2079
|
+
df = converter.convert(df)
|
|
2080
|
+
|
|
2081
|
+
if country_column:
|
|
2082
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2083
|
+
df = converter.convert(df)
|
|
2084
|
+
|
|
2085
|
+
postal_code = self._get_postal_column(search_keys)
|
|
2086
|
+
if postal_code:
|
|
2087
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2088
|
+
df = converter.convert(df)
|
|
2089
|
+
|
|
1982
2090
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1983
2091
|
|
|
1984
2092
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1985
|
-
|
|
1986
|
-
for col in original_features_for_transform:
|
|
2093
|
+
for col in features_for_transform:
|
|
1987
2094
|
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1988
|
-
features_not_to_pass = [
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
|
|
1992
|
-
|
|
1993
|
-
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1994
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
2095
|
+
features_not_to_pass = [
|
|
2096
|
+
c
|
|
2097
|
+
for c in df.columns
|
|
2098
|
+
if c not in search_keys.keys() and c not in features_for_transform and c != ENTITY_SYSTEM_RECORD_ID
|
|
2099
|
+
]
|
|
1995
2100
|
|
|
1996
2101
|
if add_fit_system_record_id:
|
|
1997
|
-
df = self.__add_fit_system_record_id(df,
|
|
2102
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2103
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2104
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1998
2105
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1999
2106
|
features_not_to_pass.append(SORT_ID)
|
|
2000
2107
|
|
|
2001
|
-
|
|
2002
|
-
|
|
2108
|
+
# search keys might be changed after explode
|
|
2109
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2003
2110
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2004
2111
|
"Float64"
|
|
2005
2112
|
)
|
|
@@ -2035,8 +2142,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2035
2142
|
rest_client=self.rest_client,
|
|
2036
2143
|
logger=self.logger,
|
|
2037
2144
|
)
|
|
2038
|
-
|
|
2039
|
-
dataset.ignore_columns = [email_column]
|
|
2145
|
+
dataset.columns_renaming = columns_renaming
|
|
2040
2146
|
|
|
2041
2147
|
if max_features is not None or importance_threshold is not None:
|
|
2042
2148
|
exclude_features_sources = list(
|
|
@@ -2125,7 +2231,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2125
2231
|
result = enrich()
|
|
2126
2232
|
|
|
2127
2233
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2128
|
-
existing_filtered_columns = [
|
|
2234
|
+
existing_filtered_columns = [
|
|
2235
|
+
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2236
|
+
]
|
|
2129
2237
|
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2130
2238
|
if add_fit_system_record_id:
|
|
2131
2239
|
selecting_columns.append(SORT_ID)
|
|
@@ -2138,7 +2246,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2138
2246
|
if add_fit_system_record_id:
|
|
2139
2247
|
result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
|
|
2140
2248
|
|
|
2141
|
-
return result
|
|
2249
|
+
return result, columns_renaming
|
|
2142
2250
|
|
|
2143
2251
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
2144
2252
|
features_info = self._internal_features_info
|
|
@@ -2239,6 +2347,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2239
2347
|
self.df_with_original_index = None
|
|
2240
2348
|
self.__cached_sampled_datasets = None
|
|
2241
2349
|
self.metrics = None
|
|
2350
|
+
self.fit_columns_renaming = None
|
|
2351
|
+
self.fit_dropped_features = set()
|
|
2352
|
+
self.fit_generated_features = []
|
|
2242
2353
|
|
|
2243
2354
|
validated_X = self._validate_X(X)
|
|
2244
2355
|
validated_y = self._validate_y(validated_X, y)
|
|
@@ -2285,9 +2396,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2285
2396
|
self.fit_search_keys = self.search_keys.copy()
|
|
2286
2397
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2287
2398
|
|
|
2288
|
-
maybe_date_column =
|
|
2399
|
+
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2289
2400
|
has_date = maybe_date_column is not None
|
|
2290
2401
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2402
|
+
|
|
2291
2403
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2292
2404
|
|
|
2293
2405
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
@@ -2317,7 +2429,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2317
2429
|
self.fit_generated_features = []
|
|
2318
2430
|
|
|
2319
2431
|
if has_date:
|
|
2320
|
-
converter = DateTimeSearchKeyConverter(
|
|
2432
|
+
converter = DateTimeSearchKeyConverter(
|
|
2433
|
+
maybe_date_column,
|
|
2434
|
+
self.date_format,
|
|
2435
|
+
self.logger,
|
|
2436
|
+
bundle=self.bundle,
|
|
2437
|
+
warnings_counter=self.warning_counter,
|
|
2438
|
+
)
|
|
2321
2439
|
df = converter.convert(df, keep_time=True)
|
|
2322
2440
|
self.logger.info(f"Date column after convertion: {df[maybe_date_column]}")
|
|
2323
2441
|
self.fit_generated_features.extend(converter.generated_features)
|
|
@@ -2326,6 +2444,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2326
2444
|
if self.add_date_if_missing:
|
|
2327
2445
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2328
2446
|
|
|
2447
|
+
email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
|
|
2448
|
+
if email_columns:
|
|
2449
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2450
|
+
df = generator.generate(df)
|
|
2451
|
+
self.fit_generated_features.extend(generator.generated_features)
|
|
2452
|
+
|
|
2329
2453
|
# Checks that need validated date
|
|
2330
2454
|
validate_dates_distribution(df, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2331
2455
|
|
|
@@ -2334,7 +2458,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2334
2458
|
|
|
2335
2459
|
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2336
2460
|
|
|
2337
|
-
|
|
2461
|
+
normalizer = Normalizer(
|
|
2462
|
+
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2463
|
+
)
|
|
2464
|
+
df = normalizer.normalize(df)
|
|
2465
|
+
columns_renaming = normalizer.columns_renaming
|
|
2466
|
+
self.fit_columns_renaming = columns_renaming
|
|
2338
2467
|
|
|
2339
2468
|
df = remove_fintech_duplicates(
|
|
2340
2469
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2342,38 +2471,58 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2342
2471
|
df = clean_full_duplicates(df, self.logger, bundle=self.bundle)
|
|
2343
2472
|
|
|
2344
2473
|
# Explode multiple search keys
|
|
2345
|
-
|
|
2346
|
-
meaning_types = {
|
|
2347
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2348
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2349
|
-
}
|
|
2350
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2351
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2352
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2353
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2474
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2354
2475
|
|
|
2355
2476
|
# TODO check that this is correct for enrichment
|
|
2356
2477
|
self.df_with_original_index = df.copy()
|
|
2478
|
+
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2357
2479
|
|
|
2358
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2480
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
|
|
2359
2481
|
|
|
2360
2482
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2361
2483
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2362
2484
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2363
|
-
email_converted_to_hem = False
|
|
2364
2485
|
if email_column:
|
|
2365
2486
|
converter = EmailSearchKeyConverter(
|
|
2366
|
-
email_column,
|
|
2487
|
+
email_column,
|
|
2488
|
+
hem_column,
|
|
2489
|
+
self.fit_search_keys,
|
|
2490
|
+
columns_renaming,
|
|
2491
|
+
list(unnest_search_keys.keys()),
|
|
2492
|
+
self.logger,
|
|
2367
2493
|
)
|
|
2368
2494
|
df = converter.convert(df)
|
|
2369
|
-
|
|
2370
|
-
|
|
2495
|
+
|
|
2496
|
+
ip_column = self._get_ip_column(self.fit_search_keys)
|
|
2497
|
+
if ip_column:
|
|
2498
|
+
converter = IpSearchKeyConverter(
|
|
2499
|
+
ip_column,
|
|
2500
|
+
self.fit_search_keys,
|
|
2501
|
+
columns_renaming,
|
|
2502
|
+
list(unnest_search_keys.keys()),
|
|
2503
|
+
self.bundle,
|
|
2504
|
+
self.logger,
|
|
2505
|
+
)
|
|
2506
|
+
df = converter.convert(df)
|
|
2507
|
+
|
|
2508
|
+
phone_column = self._get_phone_column(self.fit_search_keys)
|
|
2509
|
+
country_column = self._get_country_column(self.fit_search_keys)
|
|
2510
|
+
if phone_column:
|
|
2511
|
+
converter = PhoneSearchKeyConverter(phone_column, country_column)
|
|
2512
|
+
df = converter.convert(df)
|
|
2513
|
+
|
|
2514
|
+
if country_column:
|
|
2515
|
+
converter = CountrySearchKeyConverter(country_column)
|
|
2516
|
+
df = converter.convert(df)
|
|
2517
|
+
|
|
2518
|
+
postal_code = self._get_postal_column(self.fit_search_keys)
|
|
2519
|
+
if postal_code:
|
|
2520
|
+
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2521
|
+
df = converter.convert(df)
|
|
2371
2522
|
|
|
2372
2523
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2373
2524
|
self.fit_search_keys.keys()
|
|
2374
2525
|
)
|
|
2375
|
-
if email_converted_to_hem:
|
|
2376
|
-
non_feature_columns.append(email_column)
|
|
2377
2526
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2378
2527
|
non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2379
2528
|
|
|
@@ -2385,9 +2534,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2385
2534
|
self.fit_dropped_features.update(features_to_drop)
|
|
2386
2535
|
df = df.drop(columns=features_to_drop)
|
|
2387
2536
|
|
|
2388
|
-
if email_converted_to_hem:
|
|
2389
|
-
self.fit_dropped_features.add(email_column)
|
|
2390
|
-
|
|
2391
2537
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
|
2392
2538
|
|
|
2393
2539
|
meaning_types = {
|
|
@@ -2401,7 +2547,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2401
2547
|
if eval_set is not None and len(eval_set) > 0:
|
|
2402
2548
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2403
2549
|
|
|
2404
|
-
df = self.__add_fit_system_record_id(df,
|
|
2550
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2551
|
+
|
|
2552
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2553
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2554
|
+
|
|
2555
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2405
2556
|
|
|
2406
2557
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2407
2558
|
|
|
@@ -2419,8 +2570,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2419
2570
|
rest_client=self.rest_client,
|
|
2420
2571
|
logger=self.logger,
|
|
2421
2572
|
)
|
|
2422
|
-
|
|
2423
|
-
dataset.ignore_columns = [email_column]
|
|
2573
|
+
dataset.columns_renaming = columns_renaming
|
|
2424
2574
|
|
|
2425
2575
|
self.passed_features = [
|
|
2426
2576
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2809,7 +2959,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2809
2959
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2810
2960
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2811
2961
|
else:
|
|
2812
|
-
date_column =
|
|
2962
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2813
2963
|
sort_columns = [date_column] if date_column is not None else []
|
|
2814
2964
|
|
|
2815
2965
|
# Xy = pd.concat([X, y], axis=1)
|
|
@@ -2905,10 +3055,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2905
3055
|
|
|
2906
3056
|
do_without_pandas_limits(print_datasets_sample)
|
|
2907
3057
|
|
|
2908
|
-
maybe_date_col =
|
|
3058
|
+
maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2909
3059
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2910
3060
|
# TODO cast date column to single dtype
|
|
2911
|
-
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
3061
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format, silent_mode=True)
|
|
2912
3062
|
converted_X = date_converter.convert(X)
|
|
2913
3063
|
min_date = converted_X[maybe_date_col].min()
|
|
2914
3064
|
max_date = converted_X[maybe_date_col].max()
|
|
@@ -2935,12 +3085,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2935
3085
|
|
|
2936
3086
|
return df
|
|
2937
3087
|
|
|
2938
|
-
@staticmethod
|
|
2939
|
-
def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2940
|
-
for col, t in search_keys.items():
|
|
2941
|
-
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2942
|
-
return col
|
|
2943
|
-
|
|
2944
3088
|
@staticmethod
|
|
2945
3089
|
def _add_current_date_as_key(
|
|
2946
3090
|
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
@@ -2956,7 +3100,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2956
3100
|
logger.warning(msg)
|
|
2957
3101
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2958
3102
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2959
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE,
|
|
3103
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, silent_mode=True)
|
|
2960
3104
|
df = converter.convert(df)
|
|
2961
3105
|
return df
|
|
2962
3106
|
|
|
@@ -2984,17 +3128,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2984
3128
|
if len(cols) == 1:
|
|
2985
3129
|
return cols[0]
|
|
2986
3130
|
|
|
3131
|
+
@staticmethod
|
|
3132
|
+
def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3133
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
|
3134
|
+
if len(cols) > 1:
|
|
3135
|
+
raise Exception("More than one ip column found after unnest")
|
|
3136
|
+
if len(cols) == 1:
|
|
3137
|
+
return cols[0]
|
|
3138
|
+
|
|
2987
3139
|
@staticmethod
|
|
2988
3140
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2989
3141
|
for col, t in search_keys.items():
|
|
2990
3142
|
if t == SearchKey.PHONE:
|
|
2991
3143
|
return col
|
|
2992
3144
|
|
|
3145
|
+
@staticmethod
|
|
3146
|
+
def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3147
|
+
for col, t in search_keys.items():
|
|
3148
|
+
if t == SearchKey.COUNTRY:
|
|
3149
|
+
return col
|
|
3150
|
+
|
|
3151
|
+
@staticmethod
|
|
3152
|
+
def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
3153
|
+
for col, t in search_keys.items():
|
|
3154
|
+
if t == SearchKey.POSTAL_CODE:
|
|
3155
|
+
return col
|
|
3156
|
+
|
|
2993
3157
|
def _explode_multiple_search_keys(
|
|
2994
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3158
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
|
|
2995
3159
|
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2996
3160
|
# find groups of multiple search keys
|
|
2997
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3161
|
+
search_key_names_by_type: Dict[SearchKey, List[str]] = {}
|
|
2998
3162
|
for key_name, key_type in search_keys.items():
|
|
2999
3163
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3000
3164
|
search_key_names_by_type = {
|
|
@@ -3003,6 +3167,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3003
3167
|
if len(search_key_names_by_type) == 0:
|
|
3004
3168
|
return df, {}
|
|
3005
3169
|
|
|
3170
|
+
self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
|
|
3006
3171
|
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3007
3172
|
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3008
3173
|
exploded_dfs = []
|
|
@@ -3018,14 +3183,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3018
3183
|
del search_keys[old_key]
|
|
3019
3184
|
search_keys[new_search_key] = key_type
|
|
3020
3185
|
unnest_search_keys[new_search_key] = key_names
|
|
3186
|
+
columns_renaming[new_search_key] = new_search_key
|
|
3021
3187
|
|
|
3022
3188
|
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3189
|
+
self.logger.info(f"Finished explosion. Size after: {len(df)}")
|
|
3023
3190
|
return df, unnest_search_keys
|
|
3024
3191
|
|
|
3025
3192
|
def __add_fit_system_record_id(
|
|
3026
3193
|
self,
|
|
3027
3194
|
df: pd.DataFrame,
|
|
3028
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3195
|
+
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3029
3196
|
search_keys: Dict[str, SearchKey],
|
|
3030
3197
|
id_name: str,
|
|
3031
3198
|
) -> pd.DataFrame:
|
|
@@ -3048,39 +3215,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3048
3215
|
]
|
|
3049
3216
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3050
3217
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3051
|
-
sort_exclude_columns.append(
|
|
3218
|
+
sort_exclude_columns.append(SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]))
|
|
3052
3219
|
else:
|
|
3053
|
-
date_column =
|
|
3220
|
+
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
3054
3221
|
sort_columns = [date_column] if date_column is not None else []
|
|
3055
3222
|
|
|
3223
|
+
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
3224
|
+
sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
|
|
3225
|
+
|
|
3056
3226
|
other_columns = sorted(
|
|
3057
3227
|
[
|
|
3058
3228
|
c
|
|
3059
3229
|
for c in df.columns
|
|
3060
|
-
if c not in sort_columns
|
|
3230
|
+
if c not in sort_columns
|
|
3231
|
+
and c not in sorted_other_keys
|
|
3232
|
+
and c not in sort_exclude_columns
|
|
3233
|
+
and df[c].nunique() > 1
|
|
3061
3234
|
]
|
|
3062
|
-
# [
|
|
3063
|
-
# sk
|
|
3064
|
-
# for sk, key_type in search_keys.items()
|
|
3065
|
-
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
3066
|
-
# and sk in df.columns
|
|
3067
|
-
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
3068
|
-
# ]
|
|
3069
3235
|
)
|
|
3070
3236
|
|
|
3237
|
+
all_other_columns = sorted_other_keys + other_columns
|
|
3238
|
+
|
|
3071
3239
|
search_keys_hash = "search_keys_hash"
|
|
3072
|
-
if len(
|
|
3240
|
+
if len(all_other_columns) > 0:
|
|
3073
3241
|
sort_columns.append(search_keys_hash)
|
|
3074
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
3242
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
|
|
3075
3243
|
|
|
3076
3244
|
df = df.sort_values(by=sort_columns)
|
|
3077
3245
|
|
|
3078
3246
|
if search_keys_hash in df.columns:
|
|
3079
3247
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
3080
3248
|
|
|
3081
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3082
|
-
df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
3083
|
-
|
|
3084
3249
|
df = df.reset_index(drop=True).reset_index()
|
|
3085
3250
|
# system_record_id saves correct order for fit
|
|
3086
3251
|
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
@@ -3090,11 +3255,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3090
3255
|
df.index.name = original_index_name
|
|
3091
3256
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3092
3257
|
|
|
3093
|
-
meaning_types[id_name] = (
|
|
3094
|
-
|
|
3095
|
-
|
|
3096
|
-
|
|
3097
|
-
)
|
|
3258
|
+
# meaning_types[id_name] = (
|
|
3259
|
+
# FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3260
|
+
# if id_name == SYSTEM_RECORD_ID
|
|
3261
|
+
# else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3262
|
+
# )
|
|
3098
3263
|
return df
|
|
3099
3264
|
|
|
3100
3265
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3472,10 +3637,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3472
3637
|
for _, key_type in search_keys.items():
|
|
3473
3638
|
if not isinstance(key_type, SearchKey):
|
|
3474
3639
|
raise ValidationError(self.bundle.get("unsupported_type_of_search_key").format(key_type))
|
|
3640
|
+
|
|
3475
3641
|
valid_search_keys = {}
|
|
3476
3642
|
unsupported_search_keys = {
|
|
3477
3643
|
SearchKey.IP_RANGE_FROM,
|
|
3478
3644
|
SearchKey.IP_RANGE_TO,
|
|
3645
|
+
SearchKey.IPV6_RANGE_FROM,
|
|
3646
|
+
SearchKey.IPV6_RANGE_TO,
|
|
3479
3647
|
SearchKey.MSISDN_RANGE_FROM,
|
|
3480
3648
|
SearchKey.MSISDN_RANGE_TO,
|
|
3481
3649
|
# SearchKey.EMAIL_ONE_DOMAIN,
|
|
@@ -3565,6 +3733,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3565
3733
|
print(msg)
|
|
3566
3734
|
self.logger.warning(msg)
|
|
3567
3735
|
self.warning_counter.increment()
|
|
3736
|
+
# TODO maybe raise ValidationError
|
|
3568
3737
|
|
|
3569
3738
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3570
3739
|
|