upgini 1.1.244a24__py3-none-any.whl → 1.1.245a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/dataset.py +67 -55
- upgini/features_enricher.py +202 -186
- upgini/metrics.py +1 -0
- upgini/resource_bundle/__init__.py +14 -1
- upgini/utils/target_utils.py +8 -2
- {upgini-1.1.244a24.dist-info → upgini-1.1.245a1.dist-info}/METADATA +7 -7
- {upgini-1.1.244a24.dist-info → upgini-1.1.245a1.dist-info}/RECORD +10 -10
- {upgini-1.1.244a24.dist-info → upgini-1.1.245a1.dist-info}/WHEEL +1 -1
- {upgini-1.1.244a24.dist-info → upgini-1.1.245a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.244a24.dist-info → upgini-1.1.245a1.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -50,7 +50,7 @@ from upgini.metadata import (
|
|
|
50
50
|
SearchKey,
|
|
51
51
|
)
|
|
52
52
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
53
|
-
from upgini.resource_bundle import bundle
|
|
53
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
|
|
54
54
|
from upgini.search_task import SearchTask
|
|
55
55
|
from upgini.spinner import Spinner
|
|
56
56
|
from upgini.utils import combine_search_keys
|
|
@@ -186,8 +186,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
186
186
|
baseline_score_column: Optional[Any] = None,
|
|
187
187
|
client_ip: Optional[str] = None,
|
|
188
188
|
client_visitorid: Optional[str] = None,
|
|
189
|
+
custom_bundle_config: Optional[str] = None,
|
|
189
190
|
**kwargs,
|
|
190
191
|
):
|
|
192
|
+
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
191
193
|
self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
|
|
192
194
|
if api_key is not None and not isinstance(api_key, str):
|
|
193
195
|
raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
|
|
@@ -240,23 +242,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
240
242
|
if search_id:
|
|
241
243
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
242
244
|
|
|
243
|
-
print(bundle.get("search_by_task_id_start"))
|
|
245
|
+
print(self.bundle.get("search_by_task_id_start"))
|
|
244
246
|
trace_id = str(uuid.uuid4())
|
|
245
247
|
with MDC(trace_id=trace_id):
|
|
246
248
|
try:
|
|
247
|
-
self.logger.
|
|
249
|
+
self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
|
|
248
250
|
self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
|
|
249
251
|
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
250
252
|
x_columns = [c.originalName or c.name for c in file_metadata.columns]
|
|
251
253
|
self.__prepare_feature_importances(trace_id, x_columns)
|
|
252
254
|
# TODO validate search_keys with search_keys from file_metadata
|
|
253
|
-
print(bundle.get("search_by_task_id_finish"))
|
|
254
|
-
self.logger.
|
|
255
|
+
print(self.bundle.get("search_by_task_id_finish"))
|
|
256
|
+
self.logger.debug(f"Successfully initialized with search_id: {search_id}")
|
|
255
257
|
except HttpError as e:
|
|
256
258
|
if "Interrupted by client" in e.args[0]:
|
|
257
259
|
raise ValidationError("Search was cancelled")
|
|
258
260
|
except Exception as e:
|
|
259
|
-
print(bundle.get("failed_search_by_task_id"))
|
|
261
|
+
print(self.bundle.get("failed_search_by_task_id"))
|
|
260
262
|
self.logger.exception(f"Failed to find search_id: {search_id}")
|
|
261
263
|
raise e
|
|
262
264
|
|
|
@@ -277,13 +279,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
279
|
self.round_embeddings = round_embeddings
|
|
278
280
|
if generate_features is not None:
|
|
279
281
|
if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
|
|
280
|
-
msg = bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
|
|
282
|
+
msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
|
|
281
283
|
self.logger.error(msg)
|
|
282
284
|
raise ValidationError(msg)
|
|
283
285
|
self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
|
|
284
286
|
if round_embeddings is not None:
|
|
285
287
|
if not isinstance(round_embeddings, int) or round_embeddings < 0:
|
|
286
|
-
msg = bundle.get("invalid_round_embeddings")
|
|
288
|
+
msg = self.bundle.get("invalid_round_embeddings")
|
|
287
289
|
self.logger.error(msg)
|
|
288
290
|
raise ValidationError(msg)
|
|
289
291
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
@@ -309,7 +311,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
309
311
|
api_key = property(_get_api_key, _set_api_key)
|
|
310
312
|
|
|
311
313
|
@staticmethod
|
|
312
|
-
def _check_eval_set(eval_set, X):
|
|
314
|
+
def _check_eval_set(eval_set, X, bundle: ResourceBundle):
|
|
313
315
|
checked_eval_set = []
|
|
314
316
|
if eval_set is not None and isinstance(eval_set, tuple):
|
|
315
317
|
eval_set = [eval_set]
|
|
@@ -318,7 +320,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
318
320
|
for eval_pair in eval_set or []:
|
|
319
321
|
if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
|
|
320
322
|
raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
|
321
|
-
if not is_frames_equal(X, eval_pair[0]):
|
|
323
|
+
if not is_frames_equal(X, eval_pair[0], bundle):
|
|
322
324
|
checked_eval_set.append(eval_pair)
|
|
323
325
|
return checked_eval_set
|
|
324
326
|
|
|
@@ -401,7 +403,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
401
403
|
try:
|
|
402
404
|
self.X = X
|
|
403
405
|
self.y = y
|
|
404
|
-
self.eval_set = self._check_eval_set(eval_set, X)
|
|
406
|
+
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
405
407
|
self.dump_input(trace_id, X, y, eval_set)
|
|
406
408
|
self.__inner_fit(
|
|
407
409
|
trace_id,
|
|
@@ -439,7 +441,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
439
441
|
if len(e.args) > 0 and (
|
|
440
442
|
"File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
|
|
441
443
|
):
|
|
442
|
-
self.__display_support_link(bundle.get("features_info_zero_important_features"))
|
|
444
|
+
self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
|
|
443
445
|
elif isinstance(e, ValidationError):
|
|
444
446
|
self._dump_python_libs()
|
|
445
447
|
self._show_error(str(e))
|
|
@@ -540,11 +542,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
540
542
|
try:
|
|
541
543
|
self.X = X
|
|
542
544
|
self.y = y
|
|
543
|
-
self.eval_set = self._check_eval_set(eval_set, X)
|
|
545
|
+
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
544
546
|
self.dump_input(trace_id, X, y, eval_set)
|
|
545
547
|
|
|
546
548
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
547
|
-
raise ValidationError(
|
|
549
|
+
raise ValidationError(
|
|
550
|
+
self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS)
|
|
551
|
+
)
|
|
548
552
|
|
|
549
553
|
self.__inner_fit(
|
|
550
554
|
trace_id,
|
|
@@ -581,7 +585,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
581
585
|
if len(e.args) > 0 and (
|
|
582
586
|
"File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
|
|
583
587
|
):
|
|
584
|
-
self.__display_support_link(bundle.get("features_info_zero_important_features"))
|
|
588
|
+
self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
|
|
585
589
|
return None
|
|
586
590
|
elif isinstance(e, ValidationError):
|
|
587
591
|
self._dump_python_libs()
|
|
@@ -677,11 +681,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
677
681
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
678
682
|
try:
|
|
679
683
|
if len(self.feature_names_) == 0:
|
|
680
|
-
self.logger.warning(bundle.get("no_important_features_for_transform"))
|
|
684
|
+
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
681
685
|
return X
|
|
682
686
|
|
|
683
687
|
if self._has_paid_features(exclude_features_sources):
|
|
684
|
-
msg = bundle.get("transform_with_paid_features")
|
|
688
|
+
msg = self.bundle.get("transform_with_paid_features")
|
|
685
689
|
self.logger.warning(msg)
|
|
686
690
|
self.__display_support_link(msg)
|
|
687
691
|
return None
|
|
@@ -691,13 +695,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
691
695
|
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
692
696
|
if transform_usage.has_limit:
|
|
693
697
|
if len(X) > transform_usage.rest_rows:
|
|
694
|
-
msg = bundle.get("transform_usage_warning").format(
|
|
698
|
+
msg = self.bundle.get("transform_usage_warning").format(
|
|
699
|
+
len(X), transform_usage.rest_rows
|
|
700
|
+
)
|
|
695
701
|
self.logger.warning(msg)
|
|
696
702
|
print(msg)
|
|
697
703
|
show_request_quote_button()
|
|
698
704
|
return None
|
|
699
705
|
else:
|
|
700
|
-
msg = bundle.get("transform_usage_info").format(
|
|
706
|
+
msg = self.bundle.get("transform_usage_info").format(
|
|
701
707
|
transform_usage.limit, transform_usage.transformed_rows
|
|
702
708
|
)
|
|
703
709
|
self.logger.info("transform_usage_warning")
|
|
@@ -735,13 +741,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
735
741
|
if len(e.args) > 0 and (
|
|
736
742
|
"File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
|
|
737
743
|
):
|
|
738
|
-
self.__display_support_link(bundle.get("features_info_zero_important_features"))
|
|
744
|
+
self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
|
|
739
745
|
return None
|
|
740
746
|
elif len(e.args) > 0 and (
|
|
741
747
|
"You have reached the quota limit of trial data usage" in str(e.args[0])
|
|
742
748
|
or "Current user hasn't access to trial features" in str(e.args[0])
|
|
743
749
|
):
|
|
744
|
-
self.__display_support_link(bundle.get("trial_quota_limit_riched"))
|
|
750
|
+
self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
|
|
745
751
|
return None
|
|
746
752
|
elif isinstance(e, ValidationError):
|
|
747
753
|
self._dump_python_libs()
|
|
@@ -858,7 +864,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
858
864
|
or (self.X is None and X is None)
|
|
859
865
|
or (self.y is None and y is None)
|
|
860
866
|
):
|
|
861
|
-
raise ValidationError(bundle.get("metrics_unfitted_enricher"))
|
|
867
|
+
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
862
868
|
|
|
863
869
|
if X is not None and y is None:
|
|
864
870
|
raise ValidationError("X passed without y")
|
|
@@ -866,18 +872,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
866
872
|
effective_X = X if X is not None else self.X
|
|
867
873
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
868
874
|
|
|
869
|
-
effective_X = X if X is not None else self.X
|
|
870
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
871
|
-
|
|
872
|
-
effective_X = X if X is not None else self.X
|
|
873
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
874
|
-
|
|
875
875
|
validate_scoring_argument(scoring)
|
|
876
876
|
|
|
877
877
|
self._validate_baseline_score(effective_X, effective_eval_set)
|
|
878
878
|
|
|
879
879
|
if self._has_paid_features(exclude_features_sources):
|
|
880
|
-
msg = bundle.get("metrics_with_paid_features")
|
|
880
|
+
msg = self.bundle.get("metrics_with_paid_features")
|
|
881
881
|
self.logger.warning(msg)
|
|
882
882
|
self.__display_support_link(msg)
|
|
883
883
|
return None
|
|
@@ -898,7 +898,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
898
898
|
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
899
899
|
search_keys_for_metrics.append(cat_feature)
|
|
900
900
|
else:
|
|
901
|
-
raise ValidationError(bundle.get("cat_feature_search_key").format(cat_feature))
|
|
901
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
902
902
|
|
|
903
903
|
prepared_data = self._prepare_data_for_metrics(
|
|
904
904
|
trace_id=trace_id,
|
|
@@ -928,10 +928,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
928
928
|
|
|
929
929
|
gc.collect()
|
|
930
930
|
|
|
931
|
-
print(bundle.get("metrics_start"))
|
|
931
|
+
print(self.bundle.get("metrics_start"))
|
|
932
932
|
with Spinner():
|
|
933
933
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
934
|
-
print(bundle.get("metrics_no_important_free_features"))
|
|
934
|
+
print(self.bundle.get("metrics_no_important_free_features"))
|
|
935
935
|
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
936
936
|
self.warning_counter.increment()
|
|
937
937
|
return None
|
|
@@ -1025,20 +1025,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1025
1025
|
effective_X = X if X is not None else self.X
|
|
1026
1026
|
effective_y = y if y is not None else self.y
|
|
1027
1027
|
train_metrics = {
|
|
1028
|
-
bundle.get("quality_metrics_segment_header"): bundle.get(
|
|
1029
|
-
|
|
1030
|
-
|
|
1028
|
+
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1029
|
+
"quality_metrics_train_segment"
|
|
1030
|
+
),
|
|
1031
|
+
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1031
1032
|
}
|
|
1032
1033
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1033
1034
|
y_sorted
|
|
1034
1035
|
):
|
|
1035
|
-
train_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1036
|
+
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1037
|
+
np.mean(effective_y), 4
|
|
1038
|
+
)
|
|
1036
1039
|
if etalon_metric is not None:
|
|
1037
|
-
train_metrics[bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
1040
|
+
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
1038
1041
|
if enriched_metric is not None:
|
|
1039
|
-
train_metrics[
|
|
1042
|
+
train_metrics[
|
|
1043
|
+
self.bundle.get("quality_metrics_enriched_header").format(metric)
|
|
1044
|
+
] = enriched_metric
|
|
1040
1045
|
if uplift is not None:
|
|
1041
|
-
train_metrics[bundle.get("quality_metrics_uplift_header")] = uplift
|
|
1046
|
+
train_metrics[self.bundle.get("quality_metrics_uplift_header")] = uplift
|
|
1042
1047
|
metrics = [train_metrics]
|
|
1043
1048
|
|
|
1044
1049
|
# 3 If eval_set is presented - fit final model on train enriched data and score each
|
|
@@ -1090,40 +1095,42 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1090
1095
|
|
|
1091
1096
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
1092
1097
|
eval_metrics = {
|
|
1093
|
-
bundle.get("quality_metrics_segment_header"): bundle.get(
|
|
1098
|
+
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1094
1099
|
"quality_metrics_eval_segment"
|
|
1095
1100
|
).format(idx + 1),
|
|
1096
|
-
bundle.get("quality_metrics_rows_header"): _num_samples(
|
|
1097
|
-
|
|
1101
|
+
self.bundle.get("quality_metrics_rows_header"): _num_samples(
|
|
1102
|
+
effective_eval_set[idx][0]
|
|
1103
|
+
),
|
|
1104
|
+
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1098
1105
|
}
|
|
1099
1106
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1100
1107
|
eval_y_sorted
|
|
1101
1108
|
):
|
|
1102
|
-
eval_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1109
|
+
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1103
1110
|
np.mean(effective_eval_set[idx][1]), 4
|
|
1104
1111
|
)
|
|
1105
1112
|
if etalon_eval_metric is not None:
|
|
1106
1113
|
eval_metrics[
|
|
1107
|
-
bundle.get("quality_metrics_baseline_header").format(metric)
|
|
1114
|
+
self.bundle.get("quality_metrics_baseline_header").format(metric)
|
|
1108
1115
|
] = etalon_eval_metric
|
|
1109
1116
|
if enriched_eval_metric is not None:
|
|
1110
1117
|
eval_metrics[
|
|
1111
|
-
bundle.get("quality_metrics_enriched_header").format(metric)
|
|
1118
|
+
self.bundle.get("quality_metrics_enriched_header").format(metric)
|
|
1112
1119
|
] = enriched_eval_metric
|
|
1113
1120
|
if eval_uplift is not None:
|
|
1114
|
-
eval_metrics[bundle.get("quality_metrics_uplift_header")] = eval_uplift
|
|
1121
|
+
eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = eval_uplift
|
|
1115
1122
|
|
|
1116
1123
|
metrics.append(eval_metrics)
|
|
1117
1124
|
|
|
1118
1125
|
metrics_df = pd.DataFrame(metrics)
|
|
1119
|
-
mean_target_hdr = bundle.get("quality_metrics_mean_target_header")
|
|
1126
|
+
mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
|
|
1120
1127
|
if mean_target_hdr in metrics_df.columns:
|
|
1121
1128
|
metrics_df[mean_target_hdr] = metrics_df[mean_target_hdr].astype("float64")
|
|
1122
1129
|
do_without_pandas_limits(
|
|
1123
1130
|
lambda: self.logger.info(f"Metrics calculation finished successfully:\n{metrics_df}")
|
|
1124
1131
|
)
|
|
1125
1132
|
|
|
1126
|
-
uplift_col = bundle.get("quality_metrics_uplift_header")
|
|
1133
|
+
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1127
1134
|
date_column = self._get_date_column(search_keys)
|
|
1128
1135
|
if (
|
|
1129
1136
|
uplift_col in metrics_df.columns
|
|
@@ -1133,7 +1140,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1133
1140
|
and date_column is not None
|
|
1134
1141
|
and is_time_series(validated_X, date_column)
|
|
1135
1142
|
):
|
|
1136
|
-
msg = bundle.get("metrics_negative_uplift_without_cv")
|
|
1143
|
+
msg = self.bundle.get("metrics_negative_uplift_without_cv")
|
|
1137
1144
|
self.logger.warning(msg)
|
|
1138
1145
|
self.__display_support_link(msg)
|
|
1139
1146
|
elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
|
|
@@ -1149,7 +1156,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1149
1156
|
"You have reached the quota limit of trial data usage" in str(e.args[0])
|
|
1150
1157
|
or "Current user hasn't access to trial features" in str(e.args[0])
|
|
1151
1158
|
):
|
|
1152
|
-
self.__display_support_link(bundle.get("trial_quota_limit_riched"))
|
|
1159
|
+
self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
|
|
1153
1160
|
elif isinstance(e, ValidationError):
|
|
1154
1161
|
self._dump_python_libs()
|
|
1155
1162
|
self._show_error(str(e))
|
|
@@ -1171,7 +1178,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1171
1178
|
if res[1] < 0.05:
|
|
1172
1179
|
uneven_distribution = True
|
|
1173
1180
|
if uneven_distribution:
|
|
1174
|
-
msg = bundle.get("uneven_eval_target_distribution")
|
|
1181
|
+
msg = self.bundle.get("uneven_eval_target_distribution")
|
|
1175
1182
|
print(msg)
|
|
1176
1183
|
self.logger.warning(msg)
|
|
1177
1184
|
|
|
@@ -1185,14 +1192,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1185
1192
|
) -> List[str]:
|
|
1186
1193
|
if exclude_features_sources:
|
|
1187
1194
|
filtered_features_info = self.features_info[
|
|
1188
|
-
~self.features_info[bundle.get("features_info_name")].isin(exclude_features_sources)
|
|
1195
|
+
~self.features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
|
|
1189
1196
|
]
|
|
1190
1197
|
else:
|
|
1191
1198
|
filtered_features_info = self.features_info
|
|
1192
1199
|
return list(
|
|
1193
1200
|
filtered_features_info.loc[
|
|
1194
|
-
filtered_features_info[bundle.get("features_info_commercial_schema")] == commercial_schema,
|
|
1195
|
-
bundle.get("features_info_name"),
|
|
1201
|
+
filtered_features_info[self.bundle.get("features_info_commercial_schema")] == commercial_schema,
|
|
1202
|
+
self.bundle.get("features_info_name"),
|
|
1196
1203
|
].values
|
|
1197
1204
|
)
|
|
1198
1205
|
|
|
@@ -1239,7 +1246,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1239
1246
|
if X is None:
|
|
1240
1247
|
return True, self.X, self.y, self.eval_set
|
|
1241
1248
|
|
|
1242
|
-
checked_eval_set = self._check_eval_set(eval_set, X)
|
|
1249
|
+
checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
1243
1250
|
|
|
1244
1251
|
if (
|
|
1245
1252
|
X is self.X
|
|
@@ -1280,7 +1287,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1280
1287
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
1281
1288
|
validated_X = self._validate_X(X)
|
|
1282
1289
|
validated_y = self._validate_y(validated_X, y)
|
|
1283
|
-
checked_eval_set = self._check_eval_set(eval_set, X)
|
|
1290
|
+
checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
1284
1291
|
validated_eval_set = (
|
|
1285
1292
|
[self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
|
|
1286
1293
|
if checked_eval_set
|
|
@@ -1409,7 +1416,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1409
1416
|
return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
|
|
1410
1417
|
else:
|
|
1411
1418
|
self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
|
|
1412
|
-
print(bundle.get("prepare_data_for_metrics"))
|
|
1419
|
+
print(self.bundle.get("prepare_data_for_metrics"))
|
|
1413
1420
|
return self.__sample_imbalanced(
|
|
1414
1421
|
validated_X,
|
|
1415
1422
|
validated_y,
|
|
@@ -1503,7 +1510,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1503
1510
|
not_msg = ""
|
|
1504
1511
|
else:
|
|
1505
1512
|
not_msg = "not "
|
|
1506
|
-
msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
|
|
1513
|
+
msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
|
|
1507
1514
|
print(msg)
|
|
1508
1515
|
self.logger.warning(msg)
|
|
1509
1516
|
|
|
@@ -1529,7 +1536,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1529
1536
|
if eval_set is not None:
|
|
1530
1537
|
if len(enriched_eval_sets) != len(eval_set):
|
|
1531
1538
|
raise ValidationError(
|
|
1532
|
-
bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
|
|
1539
|
+
self.bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
|
|
1533
1540
|
)
|
|
1534
1541
|
|
|
1535
1542
|
for idx in range(len(eval_set)):
|
|
@@ -1680,7 +1687,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1680
1687
|
def get_features_info(self) -> pd.DataFrame:
|
|
1681
1688
|
"""Returns pandas.DataFrame with SHAP values and other info for each feature."""
|
|
1682
1689
|
if self._search_task is None or self._search_task.summary is None:
|
|
1683
|
-
msg = bundle.get("features_unfitted_enricher")
|
|
1690
|
+
msg = self.bundle.get("features_unfitted_enricher")
|
|
1684
1691
|
self.logger.warning(msg)
|
|
1685
1692
|
raise NotFittedError(msg)
|
|
1686
1693
|
|
|
@@ -1694,9 +1701,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1694
1701
|
|
|
1695
1702
|
def get_transactional_transform_api(self):
|
|
1696
1703
|
if self.api_key is None:
|
|
1697
|
-
raise ValidationError(bundle.get("transactional_transform_unregistered"))
|
|
1704
|
+
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
1698
1705
|
if self._search_task is None:
|
|
1699
|
-
raise ValidationError(bundle.get("transactional_transform_unfited"))
|
|
1706
|
+
raise ValidationError(self.bundle.get("transactional_transform_unfited"))
|
|
1700
1707
|
|
|
1701
1708
|
def key_example(key: SearchKey):
|
|
1702
1709
|
if key == SearchKey.COUNTRY:
|
|
@@ -1761,7 +1768,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1761
1768
|
) -> pd.DataFrame:
|
|
1762
1769
|
with MDC(trace_id=trace_id):
|
|
1763
1770
|
if self._search_task is None:
|
|
1764
|
-
raise NotFittedError(bundle.get("transform_unfitted_enricher"))
|
|
1771
|
+
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1765
1772
|
|
|
1766
1773
|
validated_X = self._validate_X(X, is_transform=True)
|
|
1767
1774
|
|
|
@@ -1773,13 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1773
1780
|
and not self.__is_registered
|
|
1774
1781
|
and not is_demo_dataset
|
|
1775
1782
|
):
|
|
1776
|
-
msg = bundle.get("transform_with_trial_features")
|
|
1783
|
+
msg = self.bundle.get("transform_with_trial_features")
|
|
1777
1784
|
self.logger.warning(msg)
|
|
1778
1785
|
print(msg)
|
|
1779
1786
|
|
|
1780
1787
|
columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
|
|
1781
1788
|
if len(columns_to_drop) > 0:
|
|
1782
|
-
msg = bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
1789
|
+
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
1783
1790
|
self.logger.warning(msg)
|
|
1784
1791
|
print(msg)
|
|
1785
1792
|
validated_X = validated_X.drop(columns=columns_to_drop)
|
|
@@ -1796,7 +1803,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1796
1803
|
df = self.__handle_index_search_keys(df, search_keys)
|
|
1797
1804
|
|
|
1798
1805
|
if DEFAULT_INDEX in df.columns:
|
|
1799
|
-
msg = bundle.get("unsupported_index_column")
|
|
1806
|
+
msg = self.bundle.get("unsupported_index_column")
|
|
1800
1807
|
self.logger.info(msg)
|
|
1801
1808
|
print(msg)
|
|
1802
1809
|
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
@@ -1909,9 +1916,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1909
1916
|
gc.collect()
|
|
1910
1917
|
|
|
1911
1918
|
if not silent_mode:
|
|
1912
|
-
print(bundle.get("polling_search_task").format(validation_task.search_task_id))
|
|
1919
|
+
print(self.bundle.get("polling_search_task").format(validation_task.search_task_id))
|
|
1913
1920
|
if not self.__is_registered:
|
|
1914
|
-
print(bundle.get("polling_unregister_information"))
|
|
1921
|
+
print(self.bundle.get("polling_unregister_information"))
|
|
1915
1922
|
|
|
1916
1923
|
progress = self.get_progress(trace_id, validation_task)
|
|
1917
1924
|
progress.recalculate_eta(time.time() - start_time)
|
|
@@ -1937,10 +1944,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1937
1944
|
time.sleep(polling_period_seconds)
|
|
1938
1945
|
progress = self.get_progress(trace_id, validation_task)
|
|
1939
1946
|
except KeyboardInterrupt as e:
|
|
1940
|
-
print(bundle.get("search_stopping"))
|
|
1947
|
+
print(self.bundle.get("search_stopping"))
|
|
1941
1948
|
self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
|
|
1942
1949
|
self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
|
|
1943
|
-
print(bundle.get("search_stopped"))
|
|
1950
|
+
print(self.bundle.get("search_stopped"))
|
|
1944
1951
|
raise e
|
|
1945
1952
|
|
|
1946
1953
|
validation_task.poll_result(trace_id, quiet=True)
|
|
@@ -1962,7 +1969,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1962
1969
|
return res
|
|
1963
1970
|
|
|
1964
1971
|
if not silent_mode:
|
|
1965
|
-
print(bundle.get("transform_start"))
|
|
1972
|
+
print(self.bundle.get("transform_start"))
|
|
1966
1973
|
# with Spinner():
|
|
1967
1974
|
result = enrich()
|
|
1968
1975
|
else:
|
|
@@ -1976,9 +1983,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1976
1983
|
|
|
1977
1984
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
1978
1985
|
features_info = self._internal_features_info
|
|
1979
|
-
comm_schema_header = bundle.get("features_info_commercial_schema")
|
|
1980
|
-
shap_value_header = bundle.get("features_info_shap")
|
|
1981
|
-
feature_name_header = bundle.get("features_info_name")
|
|
1986
|
+
comm_schema_header = self.bundle.get("features_info_commercial_schema")
|
|
1987
|
+
shap_value_header = self.bundle.get("features_info_shap")
|
|
1988
|
+
feature_name_header = self.bundle.get("features_info_name")
|
|
1982
1989
|
external_features = features_info[features_info[comm_schema_header].str.len() > 0]
|
|
1983
1990
|
filtered_features = external_features
|
|
1984
1991
|
if importance_threshold is not None:
|
|
@@ -2009,28 +2016,28 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2009
2016
|
return
|
|
2010
2017
|
else:
|
|
2011
2018
|
self.logger.warning("search_keys not provided")
|
|
2012
|
-
raise ValidationError(bundle.get("empty_search_keys"))
|
|
2019
|
+
raise ValidationError(self.bundle.get("empty_search_keys"))
|
|
2013
2020
|
|
|
2014
2021
|
key_types = search_keys.values()
|
|
2015
2022
|
|
|
2016
2023
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2017
|
-
msg = bundle.get("date_and_datetime_simultanious")
|
|
2024
|
+
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2018
2025
|
self.logger.warning(msg)
|
|
2019
2026
|
raise ValidationError(msg)
|
|
2020
2027
|
|
|
2021
2028
|
if SearchKey.EMAIL in key_types and SearchKey.HEM in key_types:
|
|
2022
|
-
msg = bundle.get("email_and_hem_simultanious")
|
|
2029
|
+
msg = self.bundle.get("email_and_hem_simultanious")
|
|
2023
2030
|
self.logger.warning(msg)
|
|
2024
2031
|
raise ValidationError(msg)
|
|
2025
2032
|
|
|
2026
2033
|
if SearchKey.POSTAL_CODE in key_types and SearchKey.COUNTRY not in key_types and self.country_code is None:
|
|
2027
|
-
msg = bundle.get("postal_code_without_country")
|
|
2034
|
+
msg = self.bundle.get("postal_code_without_country")
|
|
2028
2035
|
self.logger.warning(msg)
|
|
2029
2036
|
raise ValidationError(msg)
|
|
2030
2037
|
|
|
2031
2038
|
for key_type in SearchKey.__members__.values():
|
|
2032
2039
|
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2033
|
-
msg = bundle.get("multiple_search_key").format(key_type)
|
|
2040
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2034
2041
|
self.logger.warning(msg)
|
|
2035
2042
|
raise ValidationError(msg)
|
|
2036
2043
|
|
|
@@ -2040,7 +2047,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2040
2047
|
# and not is_demo_dataset
|
|
2041
2048
|
# and len(set(key_types).intersection(non_personal_keys)) == 0
|
|
2042
2049
|
# ):
|
|
2043
|
-
# msg = bundle.get("unregistered_only_personal_keys")
|
|
2050
|
+
# msg = self.bundle.get("unregistered_only_personal_keys")
|
|
2044
2051
|
# self.logger.warning(msg + f" Provided search keys: {key_types}")
|
|
2045
2052
|
# raise ValidationError(msg)
|
|
2046
2053
|
|
|
@@ -2081,19 +2088,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2081
2088
|
)
|
|
2082
2089
|
is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
|
|
2083
2090
|
if is_demo_dataset:
|
|
2084
|
-
msg = bundle.get("demo_dataset_info")
|
|
2091
|
+
msg = self.bundle.get("demo_dataset_info")
|
|
2085
2092
|
self.logger.info(msg)
|
|
2086
2093
|
if not self.__is_registered:
|
|
2087
2094
|
print(msg)
|
|
2088
2095
|
|
|
2089
2096
|
if self.generate_features is not None and len(self.generate_features) > 0:
|
|
2090
2097
|
x_columns = list(validated_X.columns)
|
|
2098
|
+
checked_generate_features = []
|
|
2091
2099
|
for gen_feature in self.generate_features:
|
|
2092
2100
|
if gen_feature not in x_columns:
|
|
2093
|
-
self.
|
|
2094
|
-
msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2101
|
+
msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2095
2102
|
print(msg)
|
|
2096
2103
|
self.logger.warning(msg)
|
|
2104
|
+
else:
|
|
2105
|
+
checked_generate_features.append(gen_feature)
|
|
2106
|
+
self.generate_features = checked_generate_features
|
|
2097
2107
|
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2098
2108
|
|
|
2099
2109
|
validate_scoring_argument(scoring)
|
|
@@ -2134,7 +2144,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2134
2144
|
df = pd.concat([df, eval_df])
|
|
2135
2145
|
|
|
2136
2146
|
if DEFAULT_INDEX in df.columns:
|
|
2137
|
-
msg = bundle.get("unsupported_index_column")
|
|
2147
|
+
msg = self.bundle.get("unsupported_index_column")
|
|
2138
2148
|
self.logger.info(msg)
|
|
2139
2149
|
print(msg)
|
|
2140
2150
|
self.fit_dropped_features.add(DEFAULT_INDEX)
|
|
@@ -2237,9 +2247,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2237
2247
|
if search_id_callback is not None:
|
|
2238
2248
|
search_id_callback(self._search_task.search_task_id)
|
|
2239
2249
|
|
|
2240
|
-
print(bundle.get("polling_search_task").format(self._search_task.search_task_id))
|
|
2250
|
+
print(self.bundle.get("polling_search_task").format(self._search_task.search_task_id))
|
|
2241
2251
|
if not self.__is_registered:
|
|
2242
|
-
print(bundle.get("polling_unregister_information"))
|
|
2252
|
+
print(self.bundle.get("polling_unregister_information"))
|
|
2243
2253
|
|
|
2244
2254
|
progress = self.get_progress(trace_id)
|
|
2245
2255
|
prev_progress = None
|
|
@@ -2265,14 +2275,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2265
2275
|
f"Search {self._search_task.search_task_id} failed with error {progress.error}"
|
|
2266
2276
|
f" and message {progress.error_message}"
|
|
2267
2277
|
)
|
|
2268
|
-
raise RuntimeError(bundle.get("search_task_failed_status"))
|
|
2278
|
+
raise RuntimeError(self.bundle.get("search_task_failed_status"))
|
|
2269
2279
|
time.sleep(poll_period_seconds)
|
|
2270
2280
|
progress = self.get_progress(trace_id)
|
|
2271
2281
|
except KeyboardInterrupt as e:
|
|
2272
|
-
print(bundle.get("search_stopping"))
|
|
2282
|
+
print(self.bundle.get("search_stopping"))
|
|
2273
2283
|
self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
|
|
2274
2284
|
self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
|
|
2275
|
-
print(bundle.get("search_stopped"))
|
|
2285
|
+
print(self.bundle.get("search_stopped"))
|
|
2276
2286
|
raise e
|
|
2277
2287
|
|
|
2278
2288
|
self._search_task.poll_result(trace_id, quiet=True)
|
|
@@ -2293,7 +2303,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2293
2303
|
)
|
|
2294
2304
|
zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
|
|
2295
2305
|
if zero_hit_columns:
|
|
2296
|
-
msg = bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2306
|
+
msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2297
2307
|
self.logger.warning(msg)
|
|
2298
2308
|
self.__display_support_link(msg)
|
|
2299
2309
|
self.warning_counter.increment()
|
|
@@ -2305,7 +2315,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2305
2315
|
unused_features_for_generation = [
|
|
2306
2316
|
dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
|
|
2307
2317
|
]
|
|
2308
|
-
msg = bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2318
|
+
msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2309
2319
|
self.logger.warning(msg)
|
|
2310
2320
|
print(msg)
|
|
2311
2321
|
self.warning_counter.increment()
|
|
@@ -2320,7 +2330,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2320
2330
|
|
|
2321
2331
|
if self._has_paid_features(exclude_features_sources):
|
|
2322
2332
|
if calculate_metrics is not None and calculate_metrics:
|
|
2323
|
-
msg = bundle.get("metrics_with_paid_features")
|
|
2333
|
+
msg = self.bundle.get("metrics_with_paid_features")
|
|
2324
2334
|
self.logger.warning(msg)
|
|
2325
2335
|
self.__display_support_link(msg)
|
|
2326
2336
|
else:
|
|
@@ -2331,7 +2341,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2331
2341
|
if len(validated_X) < self.CALCULATE_METRICS_MIN_THRESHOLD or any(
|
|
2332
2342
|
[len(eval_X) < self.CALCULATE_METRICS_MIN_THRESHOLD for eval_X, _ in validated_eval_set]
|
|
2333
2343
|
):
|
|
2334
|
-
msg = bundle.get("too_small_for_metrics")
|
|
2344
|
+
msg = self.bundle.get("too_small_for_metrics")
|
|
2335
2345
|
self.logger.warning(msg)
|
|
2336
2346
|
calculate_metrics = False
|
|
2337
2347
|
elif len(dataset) * len(dataset.columns) > self.CALCULATE_METRICS_THRESHOLD:
|
|
@@ -2362,7 +2372,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2362
2372
|
self.__show_report_button()
|
|
2363
2373
|
|
|
2364
2374
|
if not self.warning_counter.has_warnings():
|
|
2365
|
-
self.__display_support_link(bundle.get("all_ok_community_invite"))
|
|
2375
|
+
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2366
2376
|
|
|
2367
2377
|
def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
|
|
2368
2378
|
# Check Multivariate time series
|
|
@@ -2373,14 +2383,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2373
2383
|
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
|
2374
2384
|
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
|
2375
2385
|
):
|
|
2376
|
-
msg = bundle.get("multivariate_timeseries_detected")
|
|
2386
|
+
msg = self.bundle.get("multivariate_timeseries_detected")
|
|
2377
2387
|
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
|
2378
2388
|
elif (
|
|
2379
2389
|
self.cv is None
|
|
2380
2390
|
and model_task_type != ModelTaskType.REGRESSION
|
|
2381
2391
|
and self._get_group_columns(df, self.fit_search_keys)
|
|
2382
2392
|
):
|
|
2383
|
-
msg = bundle.get("group_k_fold_in_classification")
|
|
2393
|
+
msg = self.bundle.get("group_k_fold_in_classification")
|
|
2384
2394
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
2385
2395
|
|
|
2386
2396
|
def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
|
|
@@ -2400,11 +2410,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2400
2410
|
|
|
2401
2411
|
def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
|
|
2402
2412
|
if _num_samples(X) == 0:
|
|
2403
|
-
raise ValidationError(bundle.get("x_is_empty"))
|
|
2413
|
+
raise ValidationError(self.bundle.get("x_is_empty"))
|
|
2404
2414
|
|
|
2405
2415
|
if isinstance(X, pd.DataFrame):
|
|
2406
2416
|
if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
|
|
2407
|
-
raise ValidationError(bundle.get("x_multiindex_unsupported"))
|
|
2417
|
+
raise ValidationError(self.bundle.get("x_multiindex_unsupported"))
|
|
2408
2418
|
validated_X = X.copy()
|
|
2409
2419
|
elif isinstance(X, pd.Series):
|
|
2410
2420
|
validated_X = X.to_frame()
|
|
@@ -2413,12 +2423,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2413
2423
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2414
2424
|
validated_X = validated_X.rename(columns=renaming)
|
|
2415
2425
|
else:
|
|
2416
|
-
raise ValidationError(bundle.get("unsupported_x_type").format(type(X)))
|
|
2426
|
+
raise ValidationError(self.bundle.get("unsupported_x_type").format(type(X)))
|
|
2417
2427
|
|
|
2418
2428
|
if len(set(validated_X.columns)) != len(validated_X.columns):
|
|
2419
|
-
raise ValidationError(bundle.get("x_contains_dup_columns"))
|
|
2429
|
+
raise ValidationError(self.bundle.get("x_contains_dup_columns"))
|
|
2420
2430
|
if not is_transform and not validated_X.index.is_unique:
|
|
2421
|
-
raise ValidationError(bundle.get("x_non_unique_index"))
|
|
2431
|
+
raise ValidationError(self.bundle.get("x_non_unique_index"))
|
|
2422
2432
|
|
|
2423
2433
|
if self.exclude_columns is not None:
|
|
2424
2434
|
validated_X = validated_X.drop(columns=self.exclude_columns, errors="ignore")
|
|
@@ -2429,17 +2439,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2429
2439
|
)
|
|
2430
2440
|
|
|
2431
2441
|
if TARGET in validated_X.columns:
|
|
2432
|
-
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(TARGET))
|
|
2442
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(TARGET))
|
|
2433
2443
|
if not is_transform and EVAL_SET_INDEX in validated_X.columns:
|
|
2434
|
-
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
|
|
2444
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
|
|
2435
2445
|
if SYSTEM_RECORD_ID in validated_X.columns:
|
|
2436
|
-
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
|
|
2446
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
|
|
2437
2447
|
|
|
2438
2448
|
return validated_X
|
|
2439
2449
|
|
|
2440
2450
|
def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
|
|
2441
2451
|
if _num_samples(y) == 0:
|
|
2442
|
-
raise ValidationError(bundle.get("y_is_empty"))
|
|
2452
|
+
raise ValidationError(self.bundle.get("y_is_empty"))
|
|
2443
2453
|
|
|
2444
2454
|
if (
|
|
2445
2455
|
not isinstance(y, pd.Series)
|
|
@@ -2447,26 +2457,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2447
2457
|
and not isinstance(y, np.ndarray)
|
|
2448
2458
|
and not isinstance(y, list)
|
|
2449
2459
|
):
|
|
2450
|
-
raise ValidationError(bundle.get("unsupported_y_type").format(type(y)))
|
|
2460
|
+
raise ValidationError(self.bundle.get("unsupported_y_type").format(type(y)))
|
|
2451
2461
|
|
|
2452
2462
|
if _num_samples(X) != _num_samples(y):
|
|
2453
|
-
raise ValidationError(bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
|
|
2463
|
+
raise ValidationError(self.bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
|
|
2454
2464
|
|
|
2455
2465
|
if isinstance(y, pd.DataFrame):
|
|
2456
2466
|
if len(y.columns) != 1:
|
|
2457
|
-
raise ValidationError(bundle.get("y_invalid_dimension_dataframe"))
|
|
2467
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe"))
|
|
2458
2468
|
if isinstance(y.columns, pd.MultiIndex) or isinstance(y.index, pd.MultiIndex):
|
|
2459
|
-
raise ValidationError(bundle.get("y_multiindex_unsupported"))
|
|
2469
|
+
raise ValidationError(self.bundle.get("y_multiindex_unsupported"))
|
|
2460
2470
|
y = y[y.columns[0]]
|
|
2461
2471
|
|
|
2462
2472
|
if isinstance(y, pd.Series):
|
|
2463
2473
|
if (y.index != X.index).any():
|
|
2464
|
-
raise ValidationError(bundle.get("x_and_y_diff_index"))
|
|
2474
|
+
raise ValidationError(self.bundle.get("x_and_y_diff_index"))
|
|
2465
2475
|
validated_y = y.copy()
|
|
2466
2476
|
validated_y.rename(TARGET, inplace=True)
|
|
2467
2477
|
elif isinstance(y, np.ndarray):
|
|
2468
2478
|
if y.ndim != 1:
|
|
2469
|
-
raise ValidationError(bundle.get("y_invalid_dimension_array"))
|
|
2479
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_array"))
|
|
2470
2480
|
Xy = X.copy()
|
|
2471
2481
|
Xy[TARGET] = y
|
|
2472
2482
|
validated_y = Xy[TARGET].copy()
|
|
@@ -2476,24 +2486,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2476
2486
|
validated_y = Xy[TARGET].copy()
|
|
2477
2487
|
|
|
2478
2488
|
if validated_y.nunique() < 2:
|
|
2479
|
-
raise ValidationError(bundle.get("y_is_constant"))
|
|
2489
|
+
raise ValidationError(self.bundle.get("y_is_constant"))
|
|
2480
2490
|
|
|
2481
2491
|
return validated_y
|
|
2482
2492
|
|
|
2483
2493
|
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2484
2494
|
if len(eval_pair) != 2:
|
|
2485
|
-
raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
|
2495
|
+
raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
|
2486
2496
|
eval_X = eval_pair[0]
|
|
2487
2497
|
eval_y = eval_pair[1]
|
|
2488
2498
|
|
|
2489
2499
|
if _num_samples(eval_X) == 0:
|
|
2490
|
-
raise ValidationError(bundle.get("eval_x_is_empty"))
|
|
2500
|
+
raise ValidationError(self.bundle.get("eval_x_is_empty"))
|
|
2491
2501
|
if _num_samples(eval_y) == 0:
|
|
2492
|
-
raise ValidationError(bundle.get("eval_y_is_empty"))
|
|
2502
|
+
raise ValidationError(self.bundle.get("eval_y_is_empty"))
|
|
2493
2503
|
|
|
2494
2504
|
if isinstance(eval_X, pd.DataFrame):
|
|
2495
2505
|
if isinstance(eval_X.columns, pd.MultiIndex) or isinstance(eval_X.index, pd.MultiIndex):
|
|
2496
|
-
raise ValidationError(bundle.get("eval_x_multiindex_unsupported"))
|
|
2506
|
+
raise ValidationError(self.bundle.get("eval_x_multiindex_unsupported"))
|
|
2497
2507
|
validated_eval_X = eval_X.copy()
|
|
2498
2508
|
elif isinstance(eval_X, pd.Series):
|
|
2499
2509
|
validated_eval_X = eval_X.to_frame()
|
|
@@ -2502,10 +2512,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2502
2512
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2503
2513
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
2504
2514
|
else:
|
|
2505
|
-
raise ValidationError(bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
|
|
2515
|
+
raise ValidationError(self.bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
|
|
2506
2516
|
|
|
2507
2517
|
if not validated_eval_X.index.is_unique:
|
|
2508
|
-
raise ValidationError(bundle.get("x_non_unique_index_eval_set"))
|
|
2518
|
+
raise ValidationError(self.bundle.get("x_non_unique_index_eval_set"))
|
|
2509
2519
|
|
|
2510
2520
|
if self.exclude_columns is not None:
|
|
2511
2521
|
validated_eval_X = validated_eval_X.drop(columns=self.exclude_columns, errors="ignore")
|
|
@@ -2519,28 +2529,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2519
2529
|
if set(validated_eval_X.columns.to_list()) == set(X.columns.to_list()):
|
|
2520
2530
|
validated_eval_X = validated_eval_X[X.columns.to_list()]
|
|
2521
2531
|
else:
|
|
2522
|
-
raise ValidationError(bundle.get("eval_x_and_x_diff_shape"))
|
|
2532
|
+
raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
|
|
2523
2533
|
|
|
2524
2534
|
if _num_samples(validated_eval_X) != _num_samples(eval_y):
|
|
2525
2535
|
raise ValidationError(
|
|
2526
|
-
bundle.get("x_and_y_diff_size_eval_set").format(
|
|
2536
|
+
self.bundle.get("x_and_y_diff_size_eval_set").format(
|
|
2537
|
+
_num_samples(validated_eval_X), _num_samples(eval_y)
|
|
2538
|
+
)
|
|
2527
2539
|
)
|
|
2528
2540
|
|
|
2529
2541
|
if isinstance(eval_y, pd.DataFrame):
|
|
2530
2542
|
if len(eval_y.columns) != 1:
|
|
2531
|
-
raise ValidationError(bundle.get("y_invalid_dimension_dataframe_eval_set"))
|
|
2543
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe_eval_set"))
|
|
2532
2544
|
if isinstance(eval_y.columns, pd.MultiIndex) or isinstance(eval_y.index, pd.MultiIndex):
|
|
2533
|
-
raise ValidationError(bundle.get("eval_y_multiindex_unsupported"))
|
|
2545
|
+
raise ValidationError(self.bundle.get("eval_y_multiindex_unsupported"))
|
|
2534
2546
|
eval_y = eval_y[eval_y.columns[0]]
|
|
2535
2547
|
|
|
2536
2548
|
if isinstance(eval_y, pd.Series):
|
|
2537
2549
|
if (eval_y.index != validated_eval_X.index).any():
|
|
2538
|
-
raise ValidationError(bundle.get("x_and_y_diff_index_eval_set"))
|
|
2550
|
+
raise ValidationError(self.bundle.get("x_and_y_diff_index_eval_set"))
|
|
2539
2551
|
validated_eval_y = eval_y.copy()
|
|
2540
2552
|
validated_eval_y.rename(TARGET, inplace=True)
|
|
2541
2553
|
elif isinstance(eval_y, np.ndarray):
|
|
2542
2554
|
if eval_y.ndim != 1:
|
|
2543
|
-
raise ValidationError(bundle.get("y_invalid_dimension_array_eval_set"))
|
|
2555
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_array_eval_set"))
|
|
2544
2556
|
Xy = validated_eval_X.copy()
|
|
2545
2557
|
Xy[TARGET] = eval_y
|
|
2546
2558
|
validated_eval_y = Xy[TARGET].copy()
|
|
@@ -2549,27 +2561,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2549
2561
|
Xy[TARGET] = eval_y
|
|
2550
2562
|
validated_eval_y = Xy[TARGET].copy()
|
|
2551
2563
|
else:
|
|
2552
|
-
raise ValidationError(bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
|
|
2564
|
+
raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
|
|
2553
2565
|
|
|
2554
2566
|
if validated_eval_y.nunique() < 2:
|
|
2555
|
-
raise ValidationError(bundle.get("y_is_constant_eval_set"))
|
|
2567
|
+
raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
|
|
2556
2568
|
|
|
2557
2569
|
return validated_eval_X, validated_eval_y
|
|
2558
2570
|
|
|
2559
2571
|
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
|
|
2560
2572
|
if self.baseline_score_column is not None:
|
|
2561
2573
|
if self.baseline_score_column not in X.columns:
|
|
2562
|
-
raise ValidationError(
|
|
2574
|
+
raise ValidationError(
|
|
2575
|
+
self.bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column)
|
|
2576
|
+
)
|
|
2563
2577
|
if X[self.baseline_score_column].isna().any():
|
|
2564
|
-
raise ValidationError(bundle.get("baseline_score_column_has_na"))
|
|
2578
|
+
raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
|
|
2565
2579
|
if eval_set is not None:
|
|
2566
2580
|
if isinstance(eval_set, tuple):
|
|
2567
2581
|
eval_set = [eval_set]
|
|
2568
2582
|
for eval in eval_set:
|
|
2569
2583
|
if self.baseline_score_column not in eval[0].columns:
|
|
2570
|
-
raise ValidationError(bundle.get("baseline_score_column_not_exists"))
|
|
2584
|
+
raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
|
|
2571
2585
|
if eval[0][self.baseline_score_column].isna().any():
|
|
2572
|
-
raise ValidationError(bundle.get("baseline_score_column_has_na"))
|
|
2586
|
+
raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
|
|
2573
2587
|
|
|
2574
2588
|
@staticmethod
|
|
2575
2589
|
def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
|
|
@@ -2853,7 +2867,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2853
2867
|
) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
|
|
2854
2868
|
if result_features is None:
|
|
2855
2869
|
self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
|
|
2856
|
-
raise RuntimeError(bundle.get("features_wasnt_returned"))
|
|
2870
|
+
raise RuntimeError(self.bundle.get("features_wasnt_returned"))
|
|
2857
2871
|
result_features = (
|
|
2858
2872
|
result_features.drop(columns=EVAL_SET_INDEX)
|
|
2859
2873
|
if EVAL_SET_INDEX in result_features.columns
|
|
@@ -2864,7 +2878,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2864
2878
|
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
2865
2879
|
if len(dup_features) > 0:
|
|
2866
2880
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
2867
|
-
raise ValidationError(bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
2881
|
+
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
2868
2882
|
|
|
2869
2883
|
# index overrites from result_features
|
|
2870
2884
|
original_index_name = df_with_original_index.index.name
|
|
@@ -2924,10 +2938,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2924
2938
|
|
|
2925
2939
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str]):
|
|
2926
2940
|
if self._search_task is None:
|
|
2927
|
-
raise NotFittedError(bundle.get("transform_unfitted_enricher"))
|
|
2941
|
+
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2928
2942
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2929
2943
|
if features_meta is None:
|
|
2930
|
-
raise Exception(bundle.get("missing_features_meta"))
|
|
2944
|
+
raise Exception(self.bundle.get("missing_features_meta"))
|
|
2931
2945
|
|
|
2932
2946
|
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
|
2933
2947
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
@@ -3017,38 +3031,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3017
3031
|
)
|
|
3018
3032
|
features_info.append(
|
|
3019
3033
|
{
|
|
3020
|
-
bundle.get("features_info_name"): feature_name,
|
|
3021
|
-
bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3022
|
-
bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3023
|
-
bundle.get("features_info_value_preview"): feature_sample,
|
|
3024
|
-
bundle.get("features_info_provider"): provider,
|
|
3025
|
-
bundle.get("features_info_source"): source,
|
|
3026
|
-
bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3034
|
+
self.bundle.get("features_info_name"): feature_name,
|
|
3035
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3036
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3037
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3038
|
+
self.bundle.get("features_info_provider"): provider,
|
|
3039
|
+
self.bundle.get("features_info_source"): source,
|
|
3040
|
+
self.bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3027
3041
|
}
|
|
3028
3042
|
)
|
|
3029
3043
|
features_info_without_links.append(
|
|
3030
3044
|
{
|
|
3031
|
-
bundle.get("features_info_name"): internal_feature_name,
|
|
3032
|
-
bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3033
|
-
bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3034
|
-
bundle.get("features_info_value_preview"): feature_sample,
|
|
3035
|
-
bundle.get("features_info_provider"): internal_provider,
|
|
3036
|
-
bundle.get("features_info_source"): internal_source,
|
|
3037
|
-
bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3045
|
+
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3046
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3047
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3048
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3049
|
+
self.bundle.get("features_info_provider"): internal_provider,
|
|
3050
|
+
self.bundle.get("features_info_source"): internal_source,
|
|
3051
|
+
self.bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3038
3052
|
}
|
|
3039
3053
|
)
|
|
3040
3054
|
internal_features_info.append(
|
|
3041
3055
|
{
|
|
3042
|
-
bundle.get("features_info_name"): internal_feature_name,
|
|
3056
|
+
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3043
3057
|
"feature_link": feature_meta.doc_link,
|
|
3044
|
-
bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3045
|
-
bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3046
|
-
bundle.get("features_info_value_preview"): feature_sample,
|
|
3047
|
-
bundle.get("features_info_provider"): internal_provider,
|
|
3058
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3059
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3060
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3061
|
+
self.bundle.get("features_info_provider"): internal_provider,
|
|
3048
3062
|
"provider_link": feature_meta.data_provider_link,
|
|
3049
|
-
bundle.get("features_info_source"): internal_source,
|
|
3063
|
+
self.bundle.get("features_info_source"): internal_source,
|
|
3050
3064
|
"source_link": feature_meta.data_source_link,
|
|
3051
|
-
bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3065
|
+
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3052
3066
|
}
|
|
3053
3067
|
)
|
|
3054
3068
|
|
|
@@ -3058,8 +3072,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3058
3072
|
self._internal_features_info = pd.DataFrame(internal_features_info)
|
|
3059
3073
|
do_without_pandas_limits(lambda: self.logger.info(f"Features info:\n{self._internal_features_info}"))
|
|
3060
3074
|
|
|
3061
|
-
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info)
|
|
3062
|
-
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
3075
|
+
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
|
|
3076
|
+
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
3077
|
+
self._features_info_without_links, self.bundle
|
|
3078
|
+
)
|
|
3063
3079
|
do_without_pandas_limits(
|
|
3064
3080
|
lambda: self.logger.info(f"Relevant data sources:\n{self._relevant_data_sources_wo_links}")
|
|
3065
3081
|
)
|
|
@@ -3119,7 +3135,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3119
3135
|
return None
|
|
3120
3136
|
|
|
3121
3137
|
@staticmethod
|
|
3122
|
-
def _group_relevant_data_sources(df: pd.DataFrame) -> pd.DataFrame:
|
|
3138
|
+
def _group_relevant_data_sources(df: pd.DataFrame, bundle: ResourceBundle) -> pd.DataFrame:
|
|
3123
3139
|
return (
|
|
3124
3140
|
df.query(f"{bundle.get('features_info_provider')} != ''")
|
|
3125
3141
|
.groupby([bundle.get("features_info_provider"), bundle.get("features_info_source")])
|
|
@@ -3174,31 +3190,31 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3174
3190
|
}
|
|
3175
3191
|
passed_unsupported_search_keys = unsupported_search_keys.intersection(search_keys.values())
|
|
3176
3192
|
if len(passed_unsupported_search_keys) > 0:
|
|
3177
|
-
raise ValidationError(bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
|
|
3193
|
+
raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
|
|
3178
3194
|
|
|
3179
3195
|
for column_id, meaning_type in search_keys.items():
|
|
3180
3196
|
column_name = None
|
|
3181
3197
|
if isinstance(column_id, str):
|
|
3182
3198
|
if column_id not in x.columns:
|
|
3183
|
-
raise ValidationError(bundle.get("search_key_not_found").format(column_id, list(x.columns)))
|
|
3199
|
+
raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
|
|
3184
3200
|
column_name = column_id
|
|
3185
3201
|
valid_search_keys[column_name] = meaning_type
|
|
3186
3202
|
elif isinstance(column_id, int):
|
|
3187
3203
|
if column_id >= x.shape[1]:
|
|
3188
|
-
raise ValidationError(bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
|
|
3204
|
+
raise ValidationError(self.bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
|
|
3189
3205
|
column_name = x.columns[column_id]
|
|
3190
3206
|
valid_search_keys[column_name] = meaning_type
|
|
3191
3207
|
else:
|
|
3192
|
-
raise ValidationError(bundle.get("unsupported_search_key_type").format(type(column_id)))
|
|
3208
|
+
raise ValidationError(self.bundle.get("unsupported_search_key_type").format(type(column_id)))
|
|
3193
3209
|
|
|
3194
3210
|
if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
|
|
3195
|
-
msg = bundle.get("search_key_country_and_country_code")
|
|
3211
|
+
msg = self.bundle.get("search_key_country_and_country_code")
|
|
3196
3212
|
self.logger.warning(msg)
|
|
3197
3213
|
print(msg)
|
|
3198
3214
|
self.country_code = None
|
|
3199
3215
|
|
|
3200
3216
|
if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
|
|
3201
|
-
msg = bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3217
|
+
msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3202
3218
|
self.logger.warning(msg)
|
|
3203
3219
|
if not silent_mode:
|
|
3204
3220
|
self.warning_counter.increment()
|
|
@@ -3209,7 +3225,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3209
3225
|
if x[column_name].isnull().all() or (
|
|
3210
3226
|
is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
|
|
3211
3227
|
):
|
|
3212
|
-
raise ValidationError(bundle.get("empty_search_key").format(column_name))
|
|
3228
|
+
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3213
3229
|
|
|
3214
3230
|
if self.detect_missing_search_keys and (
|
|
3215
3231
|
not is_transform or set(valid_search_keys.values()) != set(self.fit_search_keys.values())
|
|
@@ -3219,7 +3235,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3219
3235
|
)
|
|
3220
3236
|
|
|
3221
3237
|
if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
|
|
3222
|
-
msg = bundle.get("unregistered_only_personal_keys")
|
|
3238
|
+
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
3223
3239
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3224
3240
|
raise ValidationError(msg)
|
|
3225
3241
|
|
|
@@ -3234,7 +3250,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3234
3250
|
and next(iter(valid_search_keys.values())) == SearchKey.DATE
|
|
3235
3251
|
and not silent_mode
|
|
3236
3252
|
):
|
|
3237
|
-
msg = bundle.get("date_only_search")
|
|
3253
|
+
msg = self.bundle.get("date_only_search")
|
|
3238
3254
|
print(msg)
|
|
3239
3255
|
self.logger.warning(msg)
|
|
3240
3256
|
self.warning_counter.increment()
|
|
@@ -3243,7 +3259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3243
3259
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
3244
3260
|
date_column = next(iter(maybe_date))
|
|
3245
3261
|
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
3246
|
-
msg = bundle.get("date_search_without_time_series")
|
|
3262
|
+
msg = self.bundle.get("date_search_without_time_series")
|
|
3247
3263
|
print(msg)
|
|
3248
3264
|
self.logger.warning(msg)
|
|
3249
3265
|
self.warning_counter.increment()
|
|
@@ -3252,7 +3268,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3252
3268
|
for k, v in valid_search_keys.items():
|
|
3253
3269
|
# Show warning for country only if country is the only key
|
|
3254
3270
|
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3255
|
-
msg = bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3271
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3256
3272
|
print(msg)
|
|
3257
3273
|
self.logger.warning(msg)
|
|
3258
3274
|
self.warning_counter.increment()
|
|
@@ -3284,11 +3300,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3284
3300
|
progress_callback=progress_callback,
|
|
3285
3301
|
)
|
|
3286
3302
|
if self.metrics is not None:
|
|
3287
|
-
msg = bundle.get("quality_metrics_header")
|
|
3303
|
+
msg = self.bundle.get("quality_metrics_header")
|
|
3288
3304
|
display_html_dataframe(self.metrics, self.metrics, msg)
|
|
3289
3305
|
|
|
3290
3306
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
|
3291
|
-
msg = bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))
|
|
3307
|
+
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))
|
|
3292
3308
|
|
|
3293
3309
|
try:
|
|
3294
3310
|
_ = get_ipython() # type: ignore
|
|
@@ -3297,16 +3313,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3297
3313
|
self.logger.info(msg)
|
|
3298
3314
|
if len(self.feature_names_) > 0:
|
|
3299
3315
|
display_html_dataframe(
|
|
3300
|
-
self.features_info, self._features_info_without_links, bundle.get("relevant_features_header")
|
|
3316
|
+
self.features_info, self._features_info_without_links, self.bundle.get("relevant_features_header")
|
|
3301
3317
|
)
|
|
3302
3318
|
|
|
3303
3319
|
display_html_dataframe(
|
|
3304
3320
|
self.relevant_data_sources,
|
|
3305
3321
|
self._relevant_data_sources_wo_links,
|
|
3306
|
-
bundle.get("relevant_data_sources_header"),
|
|
3322
|
+
self.bundle.get("relevant_data_sources_header"),
|
|
3307
3323
|
)
|
|
3308
3324
|
else:
|
|
3309
|
-
msg = bundle.get("features_info_zero_important_features")
|
|
3325
|
+
msg = self.bundle.get("features_info_zero_important_features")
|
|
3310
3326
|
self.logger.warning(msg)
|
|
3311
3327
|
self.__display_support_link(msg)
|
|
3312
3328
|
self.warning_counter.increment()
|
|
@@ -3333,14 +3349,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3333
3349
|
return float(importance_threshold) if importance_threshold is not None else 0.0
|
|
3334
3350
|
except ValueError:
|
|
3335
3351
|
self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
|
|
3336
|
-
raise ValidationError(bundle.get("invalid_importance_threshold"))
|
|
3352
|
+
raise ValidationError(self.bundle.get("invalid_importance_threshold"))
|
|
3337
3353
|
|
|
3338
3354
|
def __validate_max_features(self, max_features: Optional[int]) -> int:
|
|
3339
3355
|
try:
|
|
3340
3356
|
return int(max_features) if max_features is not None else 400
|
|
3341
3357
|
except ValueError:
|
|
3342
3358
|
self.logger.exception(f"Invalid max_features provided: {max_features}")
|
|
3343
|
-
raise ValidationError(bundle.get("invalid_max_features"))
|
|
3359
|
+
raise ValidationError(self.bundle.get("invalid_max_features"))
|
|
3344
3360
|
|
|
3345
3361
|
def __filtered_enriched_features(
|
|
3346
3362
|
self,
|
|
@@ -3372,7 +3388,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3372
3388
|
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3373
3389
|
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3374
3390
|
if not silent_mode:
|
|
3375
|
-
print(bundle.get("postal_code_detected").format(maybe_key))
|
|
3391
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3376
3392
|
|
|
3377
3393
|
if (
|
|
3378
3394
|
SearchKey.COUNTRY not in search_keys.values()
|
|
@@ -3385,7 +3401,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3385
3401
|
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3386
3402
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3387
3403
|
if not silent_mode:
|
|
3388
|
-
print(bundle.get("country_detected").format(maybe_key))
|
|
3404
|
+
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3389
3405
|
|
|
3390
3406
|
if (
|
|
3391
3407
|
SearchKey.EMAIL not in search_keys.values()
|
|
@@ -3399,13 +3415,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3399
3415
|
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3400
3416
|
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3401
3417
|
if not silent_mode:
|
|
3402
|
-
print(bundle.get("email_detected").format(maybe_key))
|
|
3418
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3403
3419
|
else:
|
|
3404
3420
|
self.logger.warning(
|
|
3405
3421
|
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3406
3422
|
)
|
|
3407
3423
|
if not silent_mode:
|
|
3408
|
-
print(bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3424
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3409
3425
|
self.warning_counter.increment()
|
|
3410
3426
|
|
|
3411
3427
|
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
@@ -3416,20 +3432,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3416
3432
|
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3417
3433
|
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3418
3434
|
if not silent_mode:
|
|
3419
|
-
print(bundle.get("phone_detected").format(maybe_key))
|
|
3435
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3420
3436
|
else:
|
|
3421
3437
|
self.logger.warning(
|
|
3422
3438
|
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3423
3439
|
)
|
|
3424
3440
|
if not silent_mode:
|
|
3425
|
-
print(bundle.get("phone_detected_not_registered"))
|
|
3441
|
+
print(self.bundle.get("phone_detected_not_registered"))
|
|
3426
3442
|
self.warning_counter.increment()
|
|
3427
3443
|
|
|
3428
3444
|
return search_keys
|
|
3429
3445
|
|
|
3430
3446
|
def _validate_binary_observations(self, y, task_type: ModelTaskType):
|
|
3431
3447
|
if task_type == ModelTaskType.BINARY and (y.value_counts() < 1000).any():
|
|
3432
|
-
msg = bundle.get("binary_small_dataset")
|
|
3448
|
+
msg = self.bundle.get("binary_small_dataset")
|
|
3433
3449
|
self.logger.warning(msg)
|
|
3434
3450
|
print(msg)
|
|
3435
3451
|
|
|
@@ -3444,8 +3460,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3444
3460
|
self.logger.exception("Failed to dump python libs")
|
|
3445
3461
|
|
|
3446
3462
|
def __display_support_link(self, link_text: Optional[str] = None):
|
|
3447
|
-
support_link = bundle.get("support_link")
|
|
3448
|
-
link_text = link_text or bundle.get("support_text")
|
|
3463
|
+
support_link = self.bundle.get("support_link")
|
|
3464
|
+
link_text = link_text or self.bundle.get("support_text")
|
|
3449
3465
|
try:
|
|
3450
3466
|
from IPython.display import HTML, display
|
|
3451
3467
|
|
|
@@ -3561,7 +3577,7 @@ def _num_samples(x):
|
|
|
3561
3577
|
raise TypeError(message) from type_error
|
|
3562
3578
|
|
|
3563
3579
|
|
|
3564
|
-
def is_frames_equal(first, second) -> bool:
|
|
3580
|
+
def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
|
|
3565
3581
|
if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
|
|
3566
3582
|
isinstance(first, pd.Series) and isinstance(second, pd.Series)
|
|
3567
3583
|
):
|