upgini 1.1.244a25__py3-none-any.whl → 1.1.245a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/dataset.py +59 -53
- upgini/features_enricher.py +198 -185
- upgini/metrics.py +1 -0
- upgini/resource_bundle/__init__.py +14 -1
- upgini/utils/target_utils.py +8 -2
- {upgini-1.1.244a25.dist-info → upgini-1.1.245a1.dist-info}/METADATA +7 -7
- {upgini-1.1.244a25.dist-info → upgini-1.1.245a1.dist-info}/RECORD +10 -10
- {upgini-1.1.244a25.dist-info → upgini-1.1.245a1.dist-info}/WHEEL +1 -1
- {upgini-1.1.244a25.dist-info → upgini-1.1.245a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.244a25.dist-info → upgini-1.1.245a1.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -50,7 +50,7 @@ from upgini.metadata import (
|
|
|
50
50
|
SearchKey,
|
|
51
51
|
)
|
|
52
52
|
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
|
53
|
-
from upgini.resource_bundle import bundle
|
|
53
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
|
|
54
54
|
from upgini.search_task import SearchTask
|
|
55
55
|
from upgini.spinner import Spinner
|
|
56
56
|
from upgini.utils import combine_search_keys
|
|
@@ -186,8 +186,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
186
186
|
baseline_score_column: Optional[Any] = None,
|
|
187
187
|
client_ip: Optional[str] = None,
|
|
188
188
|
client_visitorid: Optional[str] = None,
|
|
189
|
+
custom_bundle_config: Optional[str] = None,
|
|
189
190
|
**kwargs,
|
|
190
191
|
):
|
|
192
|
+
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
191
193
|
self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
|
|
192
194
|
if api_key is not None and not isinstance(api_key, str):
|
|
193
195
|
raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
|
|
@@ -240,23 +242,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
240
242
|
if search_id:
|
|
241
243
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
242
244
|
|
|
243
|
-
print(bundle.get("search_by_task_id_start"))
|
|
245
|
+
print(self.bundle.get("search_by_task_id_start"))
|
|
244
246
|
trace_id = str(uuid.uuid4())
|
|
245
247
|
with MDC(trace_id=trace_id):
|
|
246
248
|
try:
|
|
247
|
-
self.logger.
|
|
249
|
+
self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
|
|
248
250
|
self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
|
|
249
251
|
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
250
252
|
x_columns = [c.originalName or c.name for c in file_metadata.columns]
|
|
251
253
|
self.__prepare_feature_importances(trace_id, x_columns)
|
|
252
254
|
# TODO validate search_keys with search_keys from file_metadata
|
|
253
|
-
print(bundle.get("search_by_task_id_finish"))
|
|
254
|
-
self.logger.
|
|
255
|
+
print(self.bundle.get("search_by_task_id_finish"))
|
|
256
|
+
self.logger.debug(f"Successfully initialized with search_id: {search_id}")
|
|
255
257
|
except HttpError as e:
|
|
256
258
|
if "Interrupted by client" in e.args[0]:
|
|
257
259
|
raise ValidationError("Search was cancelled")
|
|
258
260
|
except Exception as e:
|
|
259
|
-
print(bundle.get("failed_search_by_task_id"))
|
|
261
|
+
print(self.bundle.get("failed_search_by_task_id"))
|
|
260
262
|
self.logger.exception(f"Failed to find search_id: {search_id}")
|
|
261
263
|
raise e
|
|
262
264
|
|
|
@@ -277,13 +279,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
279
|
self.round_embeddings = round_embeddings
|
|
278
280
|
if generate_features is not None:
|
|
279
281
|
if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
|
|
280
|
-
msg = bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
|
|
282
|
+
msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
|
|
281
283
|
self.logger.error(msg)
|
|
282
284
|
raise ValidationError(msg)
|
|
283
285
|
self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
|
|
284
286
|
if round_embeddings is not None:
|
|
285
287
|
if not isinstance(round_embeddings, int) or round_embeddings < 0:
|
|
286
|
-
msg = bundle.get("invalid_round_embeddings")
|
|
288
|
+
msg = self.bundle.get("invalid_round_embeddings")
|
|
287
289
|
self.logger.error(msg)
|
|
288
290
|
raise ValidationError(msg)
|
|
289
291
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
@@ -309,7 +311,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
309
311
|
api_key = property(_get_api_key, _set_api_key)
|
|
310
312
|
|
|
311
313
|
@staticmethod
|
|
312
|
-
def _check_eval_set(eval_set, X):
|
|
314
|
+
def _check_eval_set(eval_set, X, bundle: ResourceBundle):
|
|
313
315
|
checked_eval_set = []
|
|
314
316
|
if eval_set is not None and isinstance(eval_set, tuple):
|
|
315
317
|
eval_set = [eval_set]
|
|
@@ -318,7 +320,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
318
320
|
for eval_pair in eval_set or []:
|
|
319
321
|
if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
|
|
320
322
|
raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
|
321
|
-
if not is_frames_equal(X, eval_pair[0]):
|
|
323
|
+
if not is_frames_equal(X, eval_pair[0], bundle):
|
|
322
324
|
checked_eval_set.append(eval_pair)
|
|
323
325
|
return checked_eval_set
|
|
324
326
|
|
|
@@ -401,7 +403,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
401
403
|
try:
|
|
402
404
|
self.X = X
|
|
403
405
|
self.y = y
|
|
404
|
-
self.eval_set = self._check_eval_set(eval_set, X)
|
|
406
|
+
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
405
407
|
self.dump_input(trace_id, X, y, eval_set)
|
|
406
408
|
self.__inner_fit(
|
|
407
409
|
trace_id,
|
|
@@ -439,7 +441,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
439
441
|
if len(e.args) > 0 and (
|
|
440
442
|
"File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
|
|
441
443
|
):
|
|
442
|
-
self.__display_support_link(bundle.get("features_info_zero_important_features"))
|
|
444
|
+
self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
|
|
443
445
|
elif isinstance(e, ValidationError):
|
|
444
446
|
self._dump_python_libs()
|
|
445
447
|
self._show_error(str(e))
|
|
@@ -540,11 +542,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
540
542
|
try:
|
|
541
543
|
self.X = X
|
|
542
544
|
self.y = y
|
|
543
|
-
self.eval_set = self._check_eval_set(eval_set, X)
|
|
545
|
+
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
544
546
|
self.dump_input(trace_id, X, y, eval_set)
|
|
545
547
|
|
|
546
548
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
547
|
-
raise ValidationError(
|
|
549
|
+
raise ValidationError(
|
|
550
|
+
self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS)
|
|
551
|
+
)
|
|
548
552
|
|
|
549
553
|
self.__inner_fit(
|
|
550
554
|
trace_id,
|
|
@@ -581,7 +585,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
581
585
|
if len(e.args) > 0 and (
|
|
582
586
|
"File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
|
|
583
587
|
):
|
|
584
|
-
self.__display_support_link(bundle.get("features_info_zero_important_features"))
|
|
588
|
+
self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
|
|
585
589
|
return None
|
|
586
590
|
elif isinstance(e, ValidationError):
|
|
587
591
|
self._dump_python_libs()
|
|
@@ -677,11 +681,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
677
681
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
678
682
|
try:
|
|
679
683
|
if len(self.feature_names_) == 0:
|
|
680
|
-
self.logger.warning(bundle.get("no_important_features_for_transform"))
|
|
684
|
+
self.logger.warning(self.bundle.get("no_important_features_for_transform"))
|
|
681
685
|
return X
|
|
682
686
|
|
|
683
687
|
if self._has_paid_features(exclude_features_sources):
|
|
684
|
-
msg = bundle.get("transform_with_paid_features")
|
|
688
|
+
msg = self.bundle.get("transform_with_paid_features")
|
|
685
689
|
self.logger.warning(msg)
|
|
686
690
|
self.__display_support_link(msg)
|
|
687
691
|
return None
|
|
@@ -691,13 +695,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
691
695
|
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
692
696
|
if transform_usage.has_limit:
|
|
693
697
|
if len(X) > transform_usage.rest_rows:
|
|
694
|
-
msg = bundle.get("transform_usage_warning").format(
|
|
698
|
+
msg = self.bundle.get("transform_usage_warning").format(
|
|
699
|
+
len(X), transform_usage.rest_rows
|
|
700
|
+
)
|
|
695
701
|
self.logger.warning(msg)
|
|
696
702
|
print(msg)
|
|
697
703
|
show_request_quote_button()
|
|
698
704
|
return None
|
|
699
705
|
else:
|
|
700
|
-
msg = bundle.get("transform_usage_info").format(
|
|
706
|
+
msg = self.bundle.get("transform_usage_info").format(
|
|
701
707
|
transform_usage.limit, transform_usage.transformed_rows
|
|
702
708
|
)
|
|
703
709
|
self.logger.info("transform_usage_warning")
|
|
@@ -735,13 +741,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
735
741
|
if len(e.args) > 0 and (
|
|
736
742
|
"File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
|
|
737
743
|
):
|
|
738
|
-
self.__display_support_link(bundle.get("features_info_zero_important_features"))
|
|
744
|
+
self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
|
|
739
745
|
return None
|
|
740
746
|
elif len(e.args) > 0 and (
|
|
741
747
|
"You have reached the quota limit of trial data usage" in str(e.args[0])
|
|
742
748
|
or "Current user hasn't access to trial features" in str(e.args[0])
|
|
743
749
|
):
|
|
744
|
-
self.__display_support_link(bundle.get("trial_quota_limit_riched"))
|
|
750
|
+
self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
|
|
745
751
|
return None
|
|
746
752
|
elif isinstance(e, ValidationError):
|
|
747
753
|
self._dump_python_libs()
|
|
@@ -858,7 +864,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
858
864
|
or (self.X is None and X is None)
|
|
859
865
|
or (self.y is None and y is None)
|
|
860
866
|
):
|
|
861
|
-
raise ValidationError(bundle.get("metrics_unfitted_enricher"))
|
|
867
|
+
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
862
868
|
|
|
863
869
|
if X is not None and y is None:
|
|
864
870
|
raise ValidationError("X passed without y")
|
|
@@ -866,18 +872,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
866
872
|
effective_X = X if X is not None else self.X
|
|
867
873
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
868
874
|
|
|
869
|
-
effective_X = X if X is not None else self.X
|
|
870
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
871
|
-
|
|
872
|
-
effective_X = X if X is not None else self.X
|
|
873
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
874
|
-
|
|
875
875
|
validate_scoring_argument(scoring)
|
|
876
876
|
|
|
877
877
|
self._validate_baseline_score(effective_X, effective_eval_set)
|
|
878
878
|
|
|
879
879
|
if self._has_paid_features(exclude_features_sources):
|
|
880
|
-
msg = bundle.get("metrics_with_paid_features")
|
|
880
|
+
msg = self.bundle.get("metrics_with_paid_features")
|
|
881
881
|
self.logger.warning(msg)
|
|
882
882
|
self.__display_support_link(msg)
|
|
883
883
|
return None
|
|
@@ -898,7 +898,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
898
898
|
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
899
899
|
search_keys_for_metrics.append(cat_feature)
|
|
900
900
|
else:
|
|
901
|
-
raise ValidationError(bundle.get("cat_feature_search_key").format(cat_feature))
|
|
901
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
902
902
|
|
|
903
903
|
prepared_data = self._prepare_data_for_metrics(
|
|
904
904
|
trace_id=trace_id,
|
|
@@ -928,10 +928,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
928
928
|
|
|
929
929
|
gc.collect()
|
|
930
930
|
|
|
931
|
-
print(bundle.get("metrics_start"))
|
|
931
|
+
print(self.bundle.get("metrics_start"))
|
|
932
932
|
with Spinner():
|
|
933
933
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
934
|
-
print(bundle.get("metrics_no_important_free_features"))
|
|
934
|
+
print(self.bundle.get("metrics_no_important_free_features"))
|
|
935
935
|
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
936
936
|
self.warning_counter.increment()
|
|
937
937
|
return None
|
|
@@ -1025,20 +1025,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1025
1025
|
effective_X = X if X is not None else self.X
|
|
1026
1026
|
effective_y = y if y is not None else self.y
|
|
1027
1027
|
train_metrics = {
|
|
1028
|
-
bundle.get("quality_metrics_segment_header"): bundle.get(
|
|
1029
|
-
|
|
1030
|
-
|
|
1028
|
+
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1029
|
+
"quality_metrics_train_segment"
|
|
1030
|
+
),
|
|
1031
|
+
self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
|
|
1031
1032
|
}
|
|
1032
1033
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1033
1034
|
y_sorted
|
|
1034
1035
|
):
|
|
1035
|
-
train_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1036
|
+
train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1037
|
+
np.mean(effective_y), 4
|
|
1038
|
+
)
|
|
1036
1039
|
if etalon_metric is not None:
|
|
1037
|
-
train_metrics[bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
1040
|
+
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
|
|
1038
1041
|
if enriched_metric is not None:
|
|
1039
|
-
train_metrics[
|
|
1042
|
+
train_metrics[
|
|
1043
|
+
self.bundle.get("quality_metrics_enriched_header").format(metric)
|
|
1044
|
+
] = enriched_metric
|
|
1040
1045
|
if uplift is not None:
|
|
1041
|
-
train_metrics[bundle.get("quality_metrics_uplift_header")] = uplift
|
|
1046
|
+
train_metrics[self.bundle.get("quality_metrics_uplift_header")] = uplift
|
|
1042
1047
|
metrics = [train_metrics]
|
|
1043
1048
|
|
|
1044
1049
|
# 3 If eval_set is presented - fit final model on train enriched data and score each
|
|
@@ -1090,40 +1095,42 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1090
1095
|
|
|
1091
1096
|
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
1092
1097
|
eval_metrics = {
|
|
1093
|
-
bundle.get("quality_metrics_segment_header"): bundle.get(
|
|
1098
|
+
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1094
1099
|
"quality_metrics_eval_segment"
|
|
1095
1100
|
).format(idx + 1),
|
|
1096
|
-
bundle.get("quality_metrics_rows_header"): _num_samples(
|
|
1097
|
-
|
|
1101
|
+
self.bundle.get("quality_metrics_rows_header"): _num_samples(
|
|
1102
|
+
effective_eval_set[idx][0]
|
|
1103
|
+
),
|
|
1104
|
+
# self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
|
|
1098
1105
|
}
|
|
1099
1106
|
if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
|
|
1100
1107
|
eval_y_sorted
|
|
1101
1108
|
):
|
|
1102
|
-
eval_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1109
|
+
eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
|
|
1103
1110
|
np.mean(effective_eval_set[idx][1]), 4
|
|
1104
1111
|
)
|
|
1105
1112
|
if etalon_eval_metric is not None:
|
|
1106
1113
|
eval_metrics[
|
|
1107
|
-
bundle.get("quality_metrics_baseline_header").format(metric)
|
|
1114
|
+
self.bundle.get("quality_metrics_baseline_header").format(metric)
|
|
1108
1115
|
] = etalon_eval_metric
|
|
1109
1116
|
if enriched_eval_metric is not None:
|
|
1110
1117
|
eval_metrics[
|
|
1111
|
-
bundle.get("quality_metrics_enriched_header").format(metric)
|
|
1118
|
+
self.bundle.get("quality_metrics_enriched_header").format(metric)
|
|
1112
1119
|
] = enriched_eval_metric
|
|
1113
1120
|
if eval_uplift is not None:
|
|
1114
|
-
eval_metrics[bundle.get("quality_metrics_uplift_header")] = eval_uplift
|
|
1121
|
+
eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = eval_uplift
|
|
1115
1122
|
|
|
1116
1123
|
metrics.append(eval_metrics)
|
|
1117
1124
|
|
|
1118
1125
|
metrics_df = pd.DataFrame(metrics)
|
|
1119
|
-
mean_target_hdr = bundle.get("quality_metrics_mean_target_header")
|
|
1126
|
+
mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
|
|
1120
1127
|
if mean_target_hdr in metrics_df.columns:
|
|
1121
1128
|
metrics_df[mean_target_hdr] = metrics_df[mean_target_hdr].astype("float64")
|
|
1122
1129
|
do_without_pandas_limits(
|
|
1123
1130
|
lambda: self.logger.info(f"Metrics calculation finished successfully:\n{metrics_df}")
|
|
1124
1131
|
)
|
|
1125
1132
|
|
|
1126
|
-
uplift_col = bundle.get("quality_metrics_uplift_header")
|
|
1133
|
+
uplift_col = self.bundle.get("quality_metrics_uplift_header")
|
|
1127
1134
|
date_column = self._get_date_column(search_keys)
|
|
1128
1135
|
if (
|
|
1129
1136
|
uplift_col in metrics_df.columns
|
|
@@ -1133,7 +1140,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1133
1140
|
and date_column is not None
|
|
1134
1141
|
and is_time_series(validated_X, date_column)
|
|
1135
1142
|
):
|
|
1136
|
-
msg = bundle.get("metrics_negative_uplift_without_cv")
|
|
1143
|
+
msg = self.bundle.get("metrics_negative_uplift_without_cv")
|
|
1137
1144
|
self.logger.warning(msg)
|
|
1138
1145
|
self.__display_support_link(msg)
|
|
1139
1146
|
elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
|
|
@@ -1149,7 +1156,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1149
1156
|
"You have reached the quota limit of trial data usage" in str(e.args[0])
|
|
1150
1157
|
or "Current user hasn't access to trial features" in str(e.args[0])
|
|
1151
1158
|
):
|
|
1152
|
-
self.__display_support_link(bundle.get("trial_quota_limit_riched"))
|
|
1159
|
+
self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
|
|
1153
1160
|
elif isinstance(e, ValidationError):
|
|
1154
1161
|
self._dump_python_libs()
|
|
1155
1162
|
self._show_error(str(e))
|
|
@@ -1171,7 +1178,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1171
1178
|
if res[1] < 0.05:
|
|
1172
1179
|
uneven_distribution = True
|
|
1173
1180
|
if uneven_distribution:
|
|
1174
|
-
msg = bundle.get("uneven_eval_target_distribution")
|
|
1181
|
+
msg = self.bundle.get("uneven_eval_target_distribution")
|
|
1175
1182
|
print(msg)
|
|
1176
1183
|
self.logger.warning(msg)
|
|
1177
1184
|
|
|
@@ -1185,14 +1192,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1185
1192
|
) -> List[str]:
|
|
1186
1193
|
if exclude_features_sources:
|
|
1187
1194
|
filtered_features_info = self.features_info[
|
|
1188
|
-
~self.features_info[bundle.get("features_info_name")].isin(exclude_features_sources)
|
|
1195
|
+
~self.features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
|
|
1189
1196
|
]
|
|
1190
1197
|
else:
|
|
1191
1198
|
filtered_features_info = self.features_info
|
|
1192
1199
|
return list(
|
|
1193
1200
|
filtered_features_info.loc[
|
|
1194
|
-
filtered_features_info[bundle.get("features_info_commercial_schema")] == commercial_schema,
|
|
1195
|
-
bundle.get("features_info_name"),
|
|
1201
|
+
filtered_features_info[self.bundle.get("features_info_commercial_schema")] == commercial_schema,
|
|
1202
|
+
self.bundle.get("features_info_name"),
|
|
1196
1203
|
].values
|
|
1197
1204
|
)
|
|
1198
1205
|
|
|
@@ -1239,7 +1246,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1239
1246
|
if X is None:
|
|
1240
1247
|
return True, self.X, self.y, self.eval_set
|
|
1241
1248
|
|
|
1242
|
-
checked_eval_set = self._check_eval_set(eval_set, X)
|
|
1249
|
+
checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
1243
1250
|
|
|
1244
1251
|
if (
|
|
1245
1252
|
X is self.X
|
|
@@ -1280,7 +1287,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1280
1287
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
1281
1288
|
validated_X = self._validate_X(X)
|
|
1282
1289
|
validated_y = self._validate_y(validated_X, y)
|
|
1283
|
-
checked_eval_set = self._check_eval_set(eval_set, X)
|
|
1290
|
+
checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
1284
1291
|
validated_eval_set = (
|
|
1285
1292
|
[self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
|
|
1286
1293
|
if checked_eval_set
|
|
@@ -1409,7 +1416,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1409
1416
|
return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
|
|
1410
1417
|
else:
|
|
1411
1418
|
self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
|
|
1412
|
-
print(bundle.get("prepare_data_for_metrics"))
|
|
1419
|
+
print(self.bundle.get("prepare_data_for_metrics"))
|
|
1413
1420
|
return self.__sample_imbalanced(
|
|
1414
1421
|
validated_X,
|
|
1415
1422
|
validated_y,
|
|
@@ -1503,7 +1510,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1503
1510
|
not_msg = ""
|
|
1504
1511
|
else:
|
|
1505
1512
|
not_msg = "not "
|
|
1506
|
-
msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
|
|
1513
|
+
msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
|
|
1507
1514
|
print(msg)
|
|
1508
1515
|
self.logger.warning(msg)
|
|
1509
1516
|
|
|
@@ -1529,7 +1536,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1529
1536
|
if eval_set is not None:
|
|
1530
1537
|
if len(enriched_eval_sets) != len(eval_set):
|
|
1531
1538
|
raise ValidationError(
|
|
1532
|
-
bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
|
|
1539
|
+
self.bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
|
|
1533
1540
|
)
|
|
1534
1541
|
|
|
1535
1542
|
for idx in range(len(eval_set)):
|
|
@@ -1680,7 +1687,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1680
1687
|
def get_features_info(self) -> pd.DataFrame:
|
|
1681
1688
|
"""Returns pandas.DataFrame with SHAP values and other info for each feature."""
|
|
1682
1689
|
if self._search_task is None or self._search_task.summary is None:
|
|
1683
|
-
msg = bundle.get("features_unfitted_enricher")
|
|
1690
|
+
msg = self.bundle.get("features_unfitted_enricher")
|
|
1684
1691
|
self.logger.warning(msg)
|
|
1685
1692
|
raise NotFittedError(msg)
|
|
1686
1693
|
|
|
@@ -1694,9 +1701,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1694
1701
|
|
|
1695
1702
|
def get_transactional_transform_api(self):
|
|
1696
1703
|
if self.api_key is None:
|
|
1697
|
-
raise ValidationError(bundle.get("transactional_transform_unregistered"))
|
|
1704
|
+
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
1698
1705
|
if self._search_task is None:
|
|
1699
|
-
raise ValidationError(bundle.get("transactional_transform_unfited"))
|
|
1706
|
+
raise ValidationError(self.bundle.get("transactional_transform_unfited"))
|
|
1700
1707
|
|
|
1701
1708
|
def key_example(key: SearchKey):
|
|
1702
1709
|
if key == SearchKey.COUNTRY:
|
|
@@ -1761,7 +1768,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1761
1768
|
) -> pd.DataFrame:
|
|
1762
1769
|
with MDC(trace_id=trace_id):
|
|
1763
1770
|
if self._search_task is None:
|
|
1764
|
-
raise NotFittedError(bundle.get("transform_unfitted_enricher"))
|
|
1771
|
+
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
1765
1772
|
|
|
1766
1773
|
validated_X = self._validate_X(X, is_transform=True)
|
|
1767
1774
|
|
|
@@ -1773,13 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1773
1780
|
and not self.__is_registered
|
|
1774
1781
|
and not is_demo_dataset
|
|
1775
1782
|
):
|
|
1776
|
-
msg = bundle.get("transform_with_trial_features")
|
|
1783
|
+
msg = self.bundle.get("transform_with_trial_features")
|
|
1777
1784
|
self.logger.warning(msg)
|
|
1778
1785
|
print(msg)
|
|
1779
1786
|
|
|
1780
1787
|
columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
|
|
1781
1788
|
if len(columns_to_drop) > 0:
|
|
1782
|
-
msg = bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
1789
|
+
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
1783
1790
|
self.logger.warning(msg)
|
|
1784
1791
|
print(msg)
|
|
1785
1792
|
validated_X = validated_X.drop(columns=columns_to_drop)
|
|
@@ -1796,7 +1803,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1796
1803
|
df = self.__handle_index_search_keys(df, search_keys)
|
|
1797
1804
|
|
|
1798
1805
|
if DEFAULT_INDEX in df.columns:
|
|
1799
|
-
msg = bundle.get("unsupported_index_column")
|
|
1806
|
+
msg = self.bundle.get("unsupported_index_column")
|
|
1800
1807
|
self.logger.info(msg)
|
|
1801
1808
|
print(msg)
|
|
1802
1809
|
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
@@ -1909,9 +1916,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1909
1916
|
gc.collect()
|
|
1910
1917
|
|
|
1911
1918
|
if not silent_mode:
|
|
1912
|
-
print(bundle.get("polling_search_task").format(validation_task.search_task_id))
|
|
1919
|
+
print(self.bundle.get("polling_search_task").format(validation_task.search_task_id))
|
|
1913
1920
|
if not self.__is_registered:
|
|
1914
|
-
print(bundle.get("polling_unregister_information"))
|
|
1921
|
+
print(self.bundle.get("polling_unregister_information"))
|
|
1915
1922
|
|
|
1916
1923
|
progress = self.get_progress(trace_id, validation_task)
|
|
1917
1924
|
progress.recalculate_eta(time.time() - start_time)
|
|
@@ -1937,10 +1944,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1937
1944
|
time.sleep(polling_period_seconds)
|
|
1938
1945
|
progress = self.get_progress(trace_id, validation_task)
|
|
1939
1946
|
except KeyboardInterrupt as e:
|
|
1940
|
-
print(bundle.get("search_stopping"))
|
|
1947
|
+
print(self.bundle.get("search_stopping"))
|
|
1941
1948
|
self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
|
|
1942
1949
|
self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
|
|
1943
|
-
print(bundle.get("search_stopped"))
|
|
1950
|
+
print(self.bundle.get("search_stopped"))
|
|
1944
1951
|
raise e
|
|
1945
1952
|
|
|
1946
1953
|
validation_task.poll_result(trace_id, quiet=True)
|
|
@@ -1962,7 +1969,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1962
1969
|
return res
|
|
1963
1970
|
|
|
1964
1971
|
if not silent_mode:
|
|
1965
|
-
print(bundle.get("transform_start"))
|
|
1972
|
+
print(self.bundle.get("transform_start"))
|
|
1966
1973
|
# with Spinner():
|
|
1967
1974
|
result = enrich()
|
|
1968
1975
|
else:
|
|
@@ -1976,9 +1983,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1976
1983
|
|
|
1977
1984
|
def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
|
|
1978
1985
|
features_info = self._internal_features_info
|
|
1979
|
-
comm_schema_header = bundle.get("features_info_commercial_schema")
|
|
1980
|
-
shap_value_header = bundle.get("features_info_shap")
|
|
1981
|
-
feature_name_header = bundle.get("features_info_name")
|
|
1986
|
+
comm_schema_header = self.bundle.get("features_info_commercial_schema")
|
|
1987
|
+
shap_value_header = self.bundle.get("features_info_shap")
|
|
1988
|
+
feature_name_header = self.bundle.get("features_info_name")
|
|
1982
1989
|
external_features = features_info[features_info[comm_schema_header].str.len() > 0]
|
|
1983
1990
|
filtered_features = external_features
|
|
1984
1991
|
if importance_threshold is not None:
|
|
@@ -2009,28 +2016,28 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2009
2016
|
return
|
|
2010
2017
|
else:
|
|
2011
2018
|
self.logger.warning("search_keys not provided")
|
|
2012
|
-
raise ValidationError(bundle.get("empty_search_keys"))
|
|
2019
|
+
raise ValidationError(self.bundle.get("empty_search_keys"))
|
|
2013
2020
|
|
|
2014
2021
|
key_types = search_keys.values()
|
|
2015
2022
|
|
|
2016
2023
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2017
|
-
msg = bundle.get("date_and_datetime_simultanious")
|
|
2024
|
+
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2018
2025
|
self.logger.warning(msg)
|
|
2019
2026
|
raise ValidationError(msg)
|
|
2020
2027
|
|
|
2021
2028
|
if SearchKey.EMAIL in key_types and SearchKey.HEM in key_types:
|
|
2022
|
-
msg = bundle.get("email_and_hem_simultanious")
|
|
2029
|
+
msg = self.bundle.get("email_and_hem_simultanious")
|
|
2023
2030
|
self.logger.warning(msg)
|
|
2024
2031
|
raise ValidationError(msg)
|
|
2025
2032
|
|
|
2026
2033
|
if SearchKey.POSTAL_CODE in key_types and SearchKey.COUNTRY not in key_types and self.country_code is None:
|
|
2027
|
-
msg = bundle.get("postal_code_without_country")
|
|
2034
|
+
msg = self.bundle.get("postal_code_without_country")
|
|
2028
2035
|
self.logger.warning(msg)
|
|
2029
2036
|
raise ValidationError(msg)
|
|
2030
2037
|
|
|
2031
2038
|
for key_type in SearchKey.__members__.values():
|
|
2032
2039
|
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2033
|
-
msg = bundle.get("multiple_search_key").format(key_type)
|
|
2040
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2034
2041
|
self.logger.warning(msg)
|
|
2035
2042
|
raise ValidationError(msg)
|
|
2036
2043
|
|
|
@@ -2040,7 +2047,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2040
2047
|
# and not is_demo_dataset
|
|
2041
2048
|
# and len(set(key_types).intersection(non_personal_keys)) == 0
|
|
2042
2049
|
# ):
|
|
2043
|
-
# msg = bundle.get("unregistered_only_personal_keys")
|
|
2050
|
+
# msg = self.bundle.get("unregistered_only_personal_keys")
|
|
2044
2051
|
# self.logger.warning(msg + f" Provided search keys: {key_types}")
|
|
2045
2052
|
# raise ValidationError(msg)
|
|
2046
2053
|
|
|
@@ -2081,7 +2088,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2081
2088
|
)
|
|
2082
2089
|
is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
|
|
2083
2090
|
if is_demo_dataset:
|
|
2084
|
-
msg = bundle.get("demo_dataset_info")
|
|
2091
|
+
msg = self.bundle.get("demo_dataset_info")
|
|
2085
2092
|
self.logger.info(msg)
|
|
2086
2093
|
if not self.__is_registered:
|
|
2087
2094
|
print(msg)
|
|
@@ -2091,7 +2098,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2091
2098
|
checked_generate_features = []
|
|
2092
2099
|
for gen_feature in self.generate_features:
|
|
2093
2100
|
if gen_feature not in x_columns:
|
|
2094
|
-
msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2101
|
+
msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2095
2102
|
print(msg)
|
|
2096
2103
|
self.logger.warning(msg)
|
|
2097
2104
|
else:
|
|
@@ -2137,7 +2144,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2137
2144
|
df = pd.concat([df, eval_df])
|
|
2138
2145
|
|
|
2139
2146
|
if DEFAULT_INDEX in df.columns:
|
|
2140
|
-
msg = bundle.get("unsupported_index_column")
|
|
2147
|
+
msg = self.bundle.get("unsupported_index_column")
|
|
2141
2148
|
self.logger.info(msg)
|
|
2142
2149
|
print(msg)
|
|
2143
2150
|
self.fit_dropped_features.add(DEFAULT_INDEX)
|
|
@@ -2240,9 +2247,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2240
2247
|
if search_id_callback is not None:
|
|
2241
2248
|
search_id_callback(self._search_task.search_task_id)
|
|
2242
2249
|
|
|
2243
|
-
print(bundle.get("polling_search_task").format(self._search_task.search_task_id))
|
|
2250
|
+
print(self.bundle.get("polling_search_task").format(self._search_task.search_task_id))
|
|
2244
2251
|
if not self.__is_registered:
|
|
2245
|
-
print(bundle.get("polling_unregister_information"))
|
|
2252
|
+
print(self.bundle.get("polling_unregister_information"))
|
|
2246
2253
|
|
|
2247
2254
|
progress = self.get_progress(trace_id)
|
|
2248
2255
|
prev_progress = None
|
|
@@ -2268,14 +2275,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2268
2275
|
f"Search {self._search_task.search_task_id} failed with error {progress.error}"
|
|
2269
2276
|
f" and message {progress.error_message}"
|
|
2270
2277
|
)
|
|
2271
|
-
raise RuntimeError(bundle.get("search_task_failed_status"))
|
|
2278
|
+
raise RuntimeError(self.bundle.get("search_task_failed_status"))
|
|
2272
2279
|
time.sleep(poll_period_seconds)
|
|
2273
2280
|
progress = self.get_progress(trace_id)
|
|
2274
2281
|
except KeyboardInterrupt as e:
|
|
2275
|
-
print(bundle.get("search_stopping"))
|
|
2282
|
+
print(self.bundle.get("search_stopping"))
|
|
2276
2283
|
self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
|
|
2277
2284
|
self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
|
|
2278
|
-
print(bundle.get("search_stopped"))
|
|
2285
|
+
print(self.bundle.get("search_stopped"))
|
|
2279
2286
|
raise e
|
|
2280
2287
|
|
|
2281
2288
|
self._search_task.poll_result(trace_id, quiet=True)
|
|
@@ -2296,7 +2303,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2296
2303
|
)
|
|
2297
2304
|
zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
|
|
2298
2305
|
if zero_hit_columns:
|
|
2299
|
-
msg = bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2306
|
+
msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
|
|
2300
2307
|
self.logger.warning(msg)
|
|
2301
2308
|
self.__display_support_link(msg)
|
|
2302
2309
|
self.warning_counter.increment()
|
|
@@ -2308,7 +2315,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2308
2315
|
unused_features_for_generation = [
|
|
2309
2316
|
dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
|
|
2310
2317
|
]
|
|
2311
|
-
msg = bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2318
|
+
msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
|
|
2312
2319
|
self.logger.warning(msg)
|
|
2313
2320
|
print(msg)
|
|
2314
2321
|
self.warning_counter.increment()
|
|
@@ -2323,7 +2330,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2323
2330
|
|
|
2324
2331
|
if self._has_paid_features(exclude_features_sources):
|
|
2325
2332
|
if calculate_metrics is not None and calculate_metrics:
|
|
2326
|
-
msg = bundle.get("metrics_with_paid_features")
|
|
2333
|
+
msg = self.bundle.get("metrics_with_paid_features")
|
|
2327
2334
|
self.logger.warning(msg)
|
|
2328
2335
|
self.__display_support_link(msg)
|
|
2329
2336
|
else:
|
|
@@ -2334,7 +2341,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2334
2341
|
if len(validated_X) < self.CALCULATE_METRICS_MIN_THRESHOLD or any(
|
|
2335
2342
|
[len(eval_X) < self.CALCULATE_METRICS_MIN_THRESHOLD for eval_X, _ in validated_eval_set]
|
|
2336
2343
|
):
|
|
2337
|
-
msg = bundle.get("too_small_for_metrics")
|
|
2344
|
+
msg = self.bundle.get("too_small_for_metrics")
|
|
2338
2345
|
self.logger.warning(msg)
|
|
2339
2346
|
calculate_metrics = False
|
|
2340
2347
|
elif len(dataset) * len(dataset.columns) > self.CALCULATE_METRICS_THRESHOLD:
|
|
@@ -2365,7 +2372,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2365
2372
|
self.__show_report_button()
|
|
2366
2373
|
|
|
2367
2374
|
if not self.warning_counter.has_warnings():
|
|
2368
|
-
self.__display_support_link(bundle.get("all_ok_community_invite"))
|
|
2375
|
+
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2369
2376
|
|
|
2370
2377
|
def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
|
|
2371
2378
|
# Check Multivariate time series
|
|
@@ -2376,14 +2383,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2376
2383
|
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
|
2377
2384
|
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
|
2378
2385
|
):
|
|
2379
|
-
msg = bundle.get("multivariate_timeseries_detected")
|
|
2386
|
+
msg = self.bundle.get("multivariate_timeseries_detected")
|
|
2380
2387
|
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
|
2381
2388
|
elif (
|
|
2382
2389
|
self.cv is None
|
|
2383
2390
|
and model_task_type != ModelTaskType.REGRESSION
|
|
2384
2391
|
and self._get_group_columns(df, self.fit_search_keys)
|
|
2385
2392
|
):
|
|
2386
|
-
msg = bundle.get("group_k_fold_in_classification")
|
|
2393
|
+
msg = self.bundle.get("group_k_fold_in_classification")
|
|
2387
2394
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
2388
2395
|
|
|
2389
2396
|
def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
|
|
@@ -2403,11 +2410,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2403
2410
|
|
|
2404
2411
|
def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
|
|
2405
2412
|
if _num_samples(X) == 0:
|
|
2406
|
-
raise ValidationError(bundle.get("x_is_empty"))
|
|
2413
|
+
raise ValidationError(self.bundle.get("x_is_empty"))
|
|
2407
2414
|
|
|
2408
2415
|
if isinstance(X, pd.DataFrame):
|
|
2409
2416
|
if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
|
|
2410
|
-
raise ValidationError(bundle.get("x_multiindex_unsupported"))
|
|
2417
|
+
raise ValidationError(self.bundle.get("x_multiindex_unsupported"))
|
|
2411
2418
|
validated_X = X.copy()
|
|
2412
2419
|
elif isinstance(X, pd.Series):
|
|
2413
2420
|
validated_X = X.to_frame()
|
|
@@ -2416,12 +2423,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2416
2423
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2417
2424
|
validated_X = validated_X.rename(columns=renaming)
|
|
2418
2425
|
else:
|
|
2419
|
-
raise ValidationError(bundle.get("unsupported_x_type").format(type(X)))
|
|
2426
|
+
raise ValidationError(self.bundle.get("unsupported_x_type").format(type(X)))
|
|
2420
2427
|
|
|
2421
2428
|
if len(set(validated_X.columns)) != len(validated_X.columns):
|
|
2422
|
-
raise ValidationError(bundle.get("x_contains_dup_columns"))
|
|
2429
|
+
raise ValidationError(self.bundle.get("x_contains_dup_columns"))
|
|
2423
2430
|
if not is_transform and not validated_X.index.is_unique:
|
|
2424
|
-
raise ValidationError(bundle.get("x_non_unique_index"))
|
|
2431
|
+
raise ValidationError(self.bundle.get("x_non_unique_index"))
|
|
2425
2432
|
|
|
2426
2433
|
if self.exclude_columns is not None:
|
|
2427
2434
|
validated_X = validated_X.drop(columns=self.exclude_columns, errors="ignore")
|
|
@@ -2432,17 +2439,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2432
2439
|
)
|
|
2433
2440
|
|
|
2434
2441
|
if TARGET in validated_X.columns:
|
|
2435
|
-
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(TARGET))
|
|
2442
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(TARGET))
|
|
2436
2443
|
if not is_transform and EVAL_SET_INDEX in validated_X.columns:
|
|
2437
|
-
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
|
|
2444
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
|
|
2438
2445
|
if SYSTEM_RECORD_ID in validated_X.columns:
|
|
2439
|
-
raise ValidationError(bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
|
|
2446
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
|
|
2440
2447
|
|
|
2441
2448
|
return validated_X
|
|
2442
2449
|
|
|
2443
2450
|
def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
|
|
2444
2451
|
if _num_samples(y) == 0:
|
|
2445
|
-
raise ValidationError(bundle.get("y_is_empty"))
|
|
2452
|
+
raise ValidationError(self.bundle.get("y_is_empty"))
|
|
2446
2453
|
|
|
2447
2454
|
if (
|
|
2448
2455
|
not isinstance(y, pd.Series)
|
|
@@ -2450,26 +2457,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2450
2457
|
and not isinstance(y, np.ndarray)
|
|
2451
2458
|
and not isinstance(y, list)
|
|
2452
2459
|
):
|
|
2453
|
-
raise ValidationError(bundle.get("unsupported_y_type").format(type(y)))
|
|
2460
|
+
raise ValidationError(self.bundle.get("unsupported_y_type").format(type(y)))
|
|
2454
2461
|
|
|
2455
2462
|
if _num_samples(X) != _num_samples(y):
|
|
2456
|
-
raise ValidationError(bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
|
|
2463
|
+
raise ValidationError(self.bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
|
|
2457
2464
|
|
|
2458
2465
|
if isinstance(y, pd.DataFrame):
|
|
2459
2466
|
if len(y.columns) != 1:
|
|
2460
|
-
raise ValidationError(bundle.get("y_invalid_dimension_dataframe"))
|
|
2467
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe"))
|
|
2461
2468
|
if isinstance(y.columns, pd.MultiIndex) or isinstance(y.index, pd.MultiIndex):
|
|
2462
|
-
raise ValidationError(bundle.get("y_multiindex_unsupported"))
|
|
2469
|
+
raise ValidationError(self.bundle.get("y_multiindex_unsupported"))
|
|
2463
2470
|
y = y[y.columns[0]]
|
|
2464
2471
|
|
|
2465
2472
|
if isinstance(y, pd.Series):
|
|
2466
2473
|
if (y.index != X.index).any():
|
|
2467
|
-
raise ValidationError(bundle.get("x_and_y_diff_index"))
|
|
2474
|
+
raise ValidationError(self.bundle.get("x_and_y_diff_index"))
|
|
2468
2475
|
validated_y = y.copy()
|
|
2469
2476
|
validated_y.rename(TARGET, inplace=True)
|
|
2470
2477
|
elif isinstance(y, np.ndarray):
|
|
2471
2478
|
if y.ndim != 1:
|
|
2472
|
-
raise ValidationError(bundle.get("y_invalid_dimension_array"))
|
|
2479
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_array"))
|
|
2473
2480
|
Xy = X.copy()
|
|
2474
2481
|
Xy[TARGET] = y
|
|
2475
2482
|
validated_y = Xy[TARGET].copy()
|
|
@@ -2479,24 +2486,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2479
2486
|
validated_y = Xy[TARGET].copy()
|
|
2480
2487
|
|
|
2481
2488
|
if validated_y.nunique() < 2:
|
|
2482
|
-
raise ValidationError(bundle.get("y_is_constant"))
|
|
2489
|
+
raise ValidationError(self.bundle.get("y_is_constant"))
|
|
2483
2490
|
|
|
2484
2491
|
return validated_y
|
|
2485
2492
|
|
|
2486
2493
|
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2487
2494
|
if len(eval_pair) != 2:
|
|
2488
|
-
raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
|
2495
|
+
raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
|
2489
2496
|
eval_X = eval_pair[0]
|
|
2490
2497
|
eval_y = eval_pair[1]
|
|
2491
2498
|
|
|
2492
2499
|
if _num_samples(eval_X) == 0:
|
|
2493
|
-
raise ValidationError(bundle.get("eval_x_is_empty"))
|
|
2500
|
+
raise ValidationError(self.bundle.get("eval_x_is_empty"))
|
|
2494
2501
|
if _num_samples(eval_y) == 0:
|
|
2495
|
-
raise ValidationError(bundle.get("eval_y_is_empty"))
|
|
2502
|
+
raise ValidationError(self.bundle.get("eval_y_is_empty"))
|
|
2496
2503
|
|
|
2497
2504
|
if isinstance(eval_X, pd.DataFrame):
|
|
2498
2505
|
if isinstance(eval_X.columns, pd.MultiIndex) or isinstance(eval_X.index, pd.MultiIndex):
|
|
2499
|
-
raise ValidationError(bundle.get("eval_x_multiindex_unsupported"))
|
|
2506
|
+
raise ValidationError(self.bundle.get("eval_x_multiindex_unsupported"))
|
|
2500
2507
|
validated_eval_X = eval_X.copy()
|
|
2501
2508
|
elif isinstance(eval_X, pd.Series):
|
|
2502
2509
|
validated_eval_X = eval_X.to_frame()
|
|
@@ -2505,10 +2512,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2505
2512
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2506
2513
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
2507
2514
|
else:
|
|
2508
|
-
raise ValidationError(bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
|
|
2515
|
+
raise ValidationError(self.bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
|
|
2509
2516
|
|
|
2510
2517
|
if not validated_eval_X.index.is_unique:
|
|
2511
|
-
raise ValidationError(bundle.get("x_non_unique_index_eval_set"))
|
|
2518
|
+
raise ValidationError(self.bundle.get("x_non_unique_index_eval_set"))
|
|
2512
2519
|
|
|
2513
2520
|
if self.exclude_columns is not None:
|
|
2514
2521
|
validated_eval_X = validated_eval_X.drop(columns=self.exclude_columns, errors="ignore")
|
|
@@ -2522,28 +2529,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2522
2529
|
if set(validated_eval_X.columns.to_list()) == set(X.columns.to_list()):
|
|
2523
2530
|
validated_eval_X = validated_eval_X[X.columns.to_list()]
|
|
2524
2531
|
else:
|
|
2525
|
-
raise ValidationError(bundle.get("eval_x_and_x_diff_shape"))
|
|
2532
|
+
raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
|
|
2526
2533
|
|
|
2527
2534
|
if _num_samples(validated_eval_X) != _num_samples(eval_y):
|
|
2528
2535
|
raise ValidationError(
|
|
2529
|
-
bundle.get("x_and_y_diff_size_eval_set").format(
|
|
2536
|
+
self.bundle.get("x_and_y_diff_size_eval_set").format(
|
|
2537
|
+
_num_samples(validated_eval_X), _num_samples(eval_y)
|
|
2538
|
+
)
|
|
2530
2539
|
)
|
|
2531
2540
|
|
|
2532
2541
|
if isinstance(eval_y, pd.DataFrame):
|
|
2533
2542
|
if len(eval_y.columns) != 1:
|
|
2534
|
-
raise ValidationError(bundle.get("y_invalid_dimension_dataframe_eval_set"))
|
|
2543
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe_eval_set"))
|
|
2535
2544
|
if isinstance(eval_y.columns, pd.MultiIndex) or isinstance(eval_y.index, pd.MultiIndex):
|
|
2536
|
-
raise ValidationError(bundle.get("eval_y_multiindex_unsupported"))
|
|
2545
|
+
raise ValidationError(self.bundle.get("eval_y_multiindex_unsupported"))
|
|
2537
2546
|
eval_y = eval_y[eval_y.columns[0]]
|
|
2538
2547
|
|
|
2539
2548
|
if isinstance(eval_y, pd.Series):
|
|
2540
2549
|
if (eval_y.index != validated_eval_X.index).any():
|
|
2541
|
-
raise ValidationError(bundle.get("x_and_y_diff_index_eval_set"))
|
|
2550
|
+
raise ValidationError(self.bundle.get("x_and_y_diff_index_eval_set"))
|
|
2542
2551
|
validated_eval_y = eval_y.copy()
|
|
2543
2552
|
validated_eval_y.rename(TARGET, inplace=True)
|
|
2544
2553
|
elif isinstance(eval_y, np.ndarray):
|
|
2545
2554
|
if eval_y.ndim != 1:
|
|
2546
|
-
raise ValidationError(bundle.get("y_invalid_dimension_array_eval_set"))
|
|
2555
|
+
raise ValidationError(self.bundle.get("y_invalid_dimension_array_eval_set"))
|
|
2547
2556
|
Xy = validated_eval_X.copy()
|
|
2548
2557
|
Xy[TARGET] = eval_y
|
|
2549
2558
|
validated_eval_y = Xy[TARGET].copy()
|
|
@@ -2552,27 +2561,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2552
2561
|
Xy[TARGET] = eval_y
|
|
2553
2562
|
validated_eval_y = Xy[TARGET].copy()
|
|
2554
2563
|
else:
|
|
2555
|
-
raise ValidationError(bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
|
|
2564
|
+
raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
|
|
2556
2565
|
|
|
2557
2566
|
if validated_eval_y.nunique() < 2:
|
|
2558
|
-
raise ValidationError(bundle.get("y_is_constant_eval_set"))
|
|
2567
|
+
raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
|
|
2559
2568
|
|
|
2560
2569
|
return validated_eval_X, validated_eval_y
|
|
2561
2570
|
|
|
2562
2571
|
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
|
|
2563
2572
|
if self.baseline_score_column is not None:
|
|
2564
2573
|
if self.baseline_score_column not in X.columns:
|
|
2565
|
-
raise ValidationError(
|
|
2574
|
+
raise ValidationError(
|
|
2575
|
+
self.bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column)
|
|
2576
|
+
)
|
|
2566
2577
|
if X[self.baseline_score_column].isna().any():
|
|
2567
|
-
raise ValidationError(bundle.get("baseline_score_column_has_na"))
|
|
2578
|
+
raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
|
|
2568
2579
|
if eval_set is not None:
|
|
2569
2580
|
if isinstance(eval_set, tuple):
|
|
2570
2581
|
eval_set = [eval_set]
|
|
2571
2582
|
for eval in eval_set:
|
|
2572
2583
|
if self.baseline_score_column not in eval[0].columns:
|
|
2573
|
-
raise ValidationError(bundle.get("baseline_score_column_not_exists"))
|
|
2584
|
+
raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
|
|
2574
2585
|
if eval[0][self.baseline_score_column].isna().any():
|
|
2575
|
-
raise ValidationError(bundle.get("baseline_score_column_has_na"))
|
|
2586
|
+
raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
|
|
2576
2587
|
|
|
2577
2588
|
@staticmethod
|
|
2578
2589
|
def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
|
|
@@ -2856,7 +2867,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2856
2867
|
) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
|
|
2857
2868
|
if result_features is None:
|
|
2858
2869
|
self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
|
|
2859
|
-
raise RuntimeError(bundle.get("features_wasnt_returned"))
|
|
2870
|
+
raise RuntimeError(self.bundle.get("features_wasnt_returned"))
|
|
2860
2871
|
result_features = (
|
|
2861
2872
|
result_features.drop(columns=EVAL_SET_INDEX)
|
|
2862
2873
|
if EVAL_SET_INDEX in result_features.columns
|
|
@@ -2867,7 +2878,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2867
2878
|
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
2868
2879
|
if len(dup_features) > 0:
|
|
2869
2880
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
2870
|
-
raise ValidationError(bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
2881
|
+
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
2871
2882
|
|
|
2872
2883
|
# index overrites from result_features
|
|
2873
2884
|
original_index_name = df_with_original_index.index.name
|
|
@@ -2927,10 +2938,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2927
2938
|
|
|
2928
2939
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str]):
|
|
2929
2940
|
if self._search_task is None:
|
|
2930
|
-
raise NotFittedError(bundle.get("transform_unfitted_enricher"))
|
|
2941
|
+
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2931
2942
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2932
2943
|
if features_meta is None:
|
|
2933
|
-
raise Exception(bundle.get("missing_features_meta"))
|
|
2944
|
+
raise Exception(self.bundle.get("missing_features_meta"))
|
|
2934
2945
|
|
|
2935
2946
|
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
|
2936
2947
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
@@ -3020,38 +3031,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3020
3031
|
)
|
|
3021
3032
|
features_info.append(
|
|
3022
3033
|
{
|
|
3023
|
-
bundle.get("features_info_name"): feature_name,
|
|
3024
|
-
bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3025
|
-
bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3026
|
-
bundle.get("features_info_value_preview"): feature_sample,
|
|
3027
|
-
bundle.get("features_info_provider"): provider,
|
|
3028
|
-
bundle.get("features_info_source"): source,
|
|
3029
|
-
bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3034
|
+
self.bundle.get("features_info_name"): feature_name,
|
|
3035
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3036
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3037
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3038
|
+
self.bundle.get("features_info_provider"): provider,
|
|
3039
|
+
self.bundle.get("features_info_source"): source,
|
|
3040
|
+
self.bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3030
3041
|
}
|
|
3031
3042
|
)
|
|
3032
3043
|
features_info_without_links.append(
|
|
3033
3044
|
{
|
|
3034
|
-
bundle.get("features_info_name"): internal_feature_name,
|
|
3035
|
-
bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3036
|
-
bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3037
|
-
bundle.get("features_info_value_preview"): feature_sample,
|
|
3038
|
-
bundle.get("features_info_provider"): internal_provider,
|
|
3039
|
-
bundle.get("features_info_source"): internal_source,
|
|
3040
|
-
bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3045
|
+
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3046
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3047
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3048
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3049
|
+
self.bundle.get("features_info_provider"): internal_provider,
|
|
3050
|
+
self.bundle.get("features_info_source"): internal_source,
|
|
3051
|
+
self.bundle.get("features_info_commercial_schema"): commercial_schema,
|
|
3041
3052
|
}
|
|
3042
3053
|
)
|
|
3043
3054
|
internal_features_info.append(
|
|
3044
3055
|
{
|
|
3045
|
-
bundle.get("features_info_name"): internal_feature_name,
|
|
3056
|
+
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3046
3057
|
"feature_link": feature_meta.doc_link,
|
|
3047
|
-
bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3048
|
-
bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3049
|
-
bundle.get("features_info_value_preview"): feature_sample,
|
|
3050
|
-
bundle.get("features_info_provider"): internal_provider,
|
|
3058
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3059
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3060
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3061
|
+
self.bundle.get("features_info_provider"): internal_provider,
|
|
3051
3062
|
"provider_link": feature_meta.data_provider_link,
|
|
3052
|
-
bundle.get("features_info_source"): internal_source,
|
|
3063
|
+
self.bundle.get("features_info_source"): internal_source,
|
|
3053
3064
|
"source_link": feature_meta.data_source_link,
|
|
3054
|
-
bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3065
|
+
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3055
3066
|
}
|
|
3056
3067
|
)
|
|
3057
3068
|
|
|
@@ -3061,8 +3072,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3061
3072
|
self._internal_features_info = pd.DataFrame(internal_features_info)
|
|
3062
3073
|
do_without_pandas_limits(lambda: self.logger.info(f"Features info:\n{self._internal_features_info}"))
|
|
3063
3074
|
|
|
3064
|
-
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info)
|
|
3065
|
-
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
3075
|
+
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
|
|
3076
|
+
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
3077
|
+
self._features_info_without_links, self.bundle
|
|
3078
|
+
)
|
|
3066
3079
|
do_without_pandas_limits(
|
|
3067
3080
|
lambda: self.logger.info(f"Relevant data sources:\n{self._relevant_data_sources_wo_links}")
|
|
3068
3081
|
)
|
|
@@ -3122,7 +3135,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3122
3135
|
return None
|
|
3123
3136
|
|
|
3124
3137
|
@staticmethod
|
|
3125
|
-
def _group_relevant_data_sources(df: pd.DataFrame) -> pd.DataFrame:
|
|
3138
|
+
def _group_relevant_data_sources(df: pd.DataFrame, bundle: ResourceBundle) -> pd.DataFrame:
|
|
3126
3139
|
return (
|
|
3127
3140
|
df.query(f"{bundle.get('features_info_provider')} != ''")
|
|
3128
3141
|
.groupby([bundle.get("features_info_provider"), bundle.get("features_info_source")])
|
|
@@ -3177,31 +3190,31 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3177
3190
|
}
|
|
3178
3191
|
passed_unsupported_search_keys = unsupported_search_keys.intersection(search_keys.values())
|
|
3179
3192
|
if len(passed_unsupported_search_keys) > 0:
|
|
3180
|
-
raise ValidationError(bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
|
|
3193
|
+
raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
|
|
3181
3194
|
|
|
3182
3195
|
for column_id, meaning_type in search_keys.items():
|
|
3183
3196
|
column_name = None
|
|
3184
3197
|
if isinstance(column_id, str):
|
|
3185
3198
|
if column_id not in x.columns:
|
|
3186
|
-
raise ValidationError(bundle.get("search_key_not_found").format(column_id, list(x.columns)))
|
|
3199
|
+
raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
|
|
3187
3200
|
column_name = column_id
|
|
3188
3201
|
valid_search_keys[column_name] = meaning_type
|
|
3189
3202
|
elif isinstance(column_id, int):
|
|
3190
3203
|
if column_id >= x.shape[1]:
|
|
3191
|
-
raise ValidationError(bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
|
|
3204
|
+
raise ValidationError(self.bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
|
|
3192
3205
|
column_name = x.columns[column_id]
|
|
3193
3206
|
valid_search_keys[column_name] = meaning_type
|
|
3194
3207
|
else:
|
|
3195
|
-
raise ValidationError(bundle.get("unsupported_search_key_type").format(type(column_id)))
|
|
3208
|
+
raise ValidationError(self.bundle.get("unsupported_search_key_type").format(type(column_id)))
|
|
3196
3209
|
|
|
3197
3210
|
if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
|
|
3198
|
-
msg = bundle.get("search_key_country_and_country_code")
|
|
3211
|
+
msg = self.bundle.get("search_key_country_and_country_code")
|
|
3199
3212
|
self.logger.warning(msg)
|
|
3200
3213
|
print(msg)
|
|
3201
3214
|
self.country_code = None
|
|
3202
3215
|
|
|
3203
3216
|
if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
|
|
3204
|
-
msg = bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3217
|
+
msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
|
|
3205
3218
|
self.logger.warning(msg)
|
|
3206
3219
|
if not silent_mode:
|
|
3207
3220
|
self.warning_counter.increment()
|
|
@@ -3212,7 +3225,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3212
3225
|
if x[column_name].isnull().all() or (
|
|
3213
3226
|
is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
|
|
3214
3227
|
):
|
|
3215
|
-
raise ValidationError(bundle.get("empty_search_key").format(column_name))
|
|
3228
|
+
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3216
3229
|
|
|
3217
3230
|
if self.detect_missing_search_keys and (
|
|
3218
3231
|
not is_transform or set(valid_search_keys.values()) != set(self.fit_search_keys.values())
|
|
@@ -3222,7 +3235,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3222
3235
|
)
|
|
3223
3236
|
|
|
3224
3237
|
if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
|
|
3225
|
-
msg = bundle.get("unregistered_only_personal_keys")
|
|
3238
|
+
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
3226
3239
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
3227
3240
|
raise ValidationError(msg)
|
|
3228
3241
|
|
|
@@ -3237,7 +3250,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3237
3250
|
and next(iter(valid_search_keys.values())) == SearchKey.DATE
|
|
3238
3251
|
and not silent_mode
|
|
3239
3252
|
):
|
|
3240
|
-
msg = bundle.get("date_only_search")
|
|
3253
|
+
msg = self.bundle.get("date_only_search")
|
|
3241
3254
|
print(msg)
|
|
3242
3255
|
self.logger.warning(msg)
|
|
3243
3256
|
self.warning_counter.increment()
|
|
@@ -3246,7 +3259,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3246
3259
|
if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
|
|
3247
3260
|
date_column = next(iter(maybe_date))
|
|
3248
3261
|
if x[date_column].nunique() > 0.9 * _num_samples(x):
|
|
3249
|
-
msg = bundle.get("date_search_without_time_series")
|
|
3262
|
+
msg = self.bundle.get("date_search_without_time_series")
|
|
3250
3263
|
print(msg)
|
|
3251
3264
|
self.logger.warning(msg)
|
|
3252
3265
|
self.warning_counter.increment()
|
|
@@ -3255,7 +3268,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3255
3268
|
for k, v in valid_search_keys.items():
|
|
3256
3269
|
# Show warning for country only if country is the only key
|
|
3257
3270
|
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3258
|
-
msg = bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3271
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3259
3272
|
print(msg)
|
|
3260
3273
|
self.logger.warning(msg)
|
|
3261
3274
|
self.warning_counter.increment()
|
|
@@ -3287,11 +3300,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3287
3300
|
progress_callback=progress_callback,
|
|
3288
3301
|
)
|
|
3289
3302
|
if self.metrics is not None:
|
|
3290
|
-
msg = bundle.get("quality_metrics_header")
|
|
3303
|
+
msg = self.bundle.get("quality_metrics_header")
|
|
3291
3304
|
display_html_dataframe(self.metrics, self.metrics, msg)
|
|
3292
3305
|
|
|
3293
3306
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
|
3294
|
-
msg = bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))
|
|
3307
|
+
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))
|
|
3295
3308
|
|
|
3296
3309
|
try:
|
|
3297
3310
|
_ = get_ipython() # type: ignore
|
|
@@ -3300,16 +3313,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3300
3313
|
self.logger.info(msg)
|
|
3301
3314
|
if len(self.feature_names_) > 0:
|
|
3302
3315
|
display_html_dataframe(
|
|
3303
|
-
self.features_info, self._features_info_without_links, bundle.get("relevant_features_header")
|
|
3316
|
+
self.features_info, self._features_info_without_links, self.bundle.get("relevant_features_header")
|
|
3304
3317
|
)
|
|
3305
3318
|
|
|
3306
3319
|
display_html_dataframe(
|
|
3307
3320
|
self.relevant_data_sources,
|
|
3308
3321
|
self._relevant_data_sources_wo_links,
|
|
3309
|
-
bundle.get("relevant_data_sources_header"),
|
|
3322
|
+
self.bundle.get("relevant_data_sources_header"),
|
|
3310
3323
|
)
|
|
3311
3324
|
else:
|
|
3312
|
-
msg = bundle.get("features_info_zero_important_features")
|
|
3325
|
+
msg = self.bundle.get("features_info_zero_important_features")
|
|
3313
3326
|
self.logger.warning(msg)
|
|
3314
3327
|
self.__display_support_link(msg)
|
|
3315
3328
|
self.warning_counter.increment()
|
|
@@ -3336,14 +3349,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3336
3349
|
return float(importance_threshold) if importance_threshold is not None else 0.0
|
|
3337
3350
|
except ValueError:
|
|
3338
3351
|
self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
|
|
3339
|
-
raise ValidationError(bundle.get("invalid_importance_threshold"))
|
|
3352
|
+
raise ValidationError(self.bundle.get("invalid_importance_threshold"))
|
|
3340
3353
|
|
|
3341
3354
|
def __validate_max_features(self, max_features: Optional[int]) -> int:
|
|
3342
3355
|
try:
|
|
3343
3356
|
return int(max_features) if max_features is not None else 400
|
|
3344
3357
|
except ValueError:
|
|
3345
3358
|
self.logger.exception(f"Invalid max_features provided: {max_features}")
|
|
3346
|
-
raise ValidationError(bundle.get("invalid_max_features"))
|
|
3359
|
+
raise ValidationError(self.bundle.get("invalid_max_features"))
|
|
3347
3360
|
|
|
3348
3361
|
def __filtered_enriched_features(
|
|
3349
3362
|
self,
|
|
@@ -3375,7 +3388,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3375
3388
|
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3376
3389
|
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3377
3390
|
if not silent_mode:
|
|
3378
|
-
print(bundle.get("postal_code_detected").format(maybe_key))
|
|
3391
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3379
3392
|
|
|
3380
3393
|
if (
|
|
3381
3394
|
SearchKey.COUNTRY not in search_keys.values()
|
|
@@ -3388,7 +3401,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3388
3401
|
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3389
3402
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3390
3403
|
if not silent_mode:
|
|
3391
|
-
print(bundle.get("country_detected").format(maybe_key))
|
|
3404
|
+
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3392
3405
|
|
|
3393
3406
|
if (
|
|
3394
3407
|
SearchKey.EMAIL not in search_keys.values()
|
|
@@ -3402,13 +3415,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3402
3415
|
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3403
3416
|
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3404
3417
|
if not silent_mode:
|
|
3405
|
-
print(bundle.get("email_detected").format(maybe_key))
|
|
3418
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3406
3419
|
else:
|
|
3407
3420
|
self.logger.warning(
|
|
3408
3421
|
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3409
3422
|
)
|
|
3410
3423
|
if not silent_mode:
|
|
3411
|
-
print(bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3424
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3412
3425
|
self.warning_counter.increment()
|
|
3413
3426
|
|
|
3414
3427
|
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
@@ -3419,20 +3432,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3419
3432
|
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3420
3433
|
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3421
3434
|
if not silent_mode:
|
|
3422
|
-
print(bundle.get("phone_detected").format(maybe_key))
|
|
3435
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3423
3436
|
else:
|
|
3424
3437
|
self.logger.warning(
|
|
3425
3438
|
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3426
3439
|
)
|
|
3427
3440
|
if not silent_mode:
|
|
3428
|
-
print(bundle.get("phone_detected_not_registered"))
|
|
3441
|
+
print(self.bundle.get("phone_detected_not_registered"))
|
|
3429
3442
|
self.warning_counter.increment()
|
|
3430
3443
|
|
|
3431
3444
|
return search_keys
|
|
3432
3445
|
|
|
3433
3446
|
def _validate_binary_observations(self, y, task_type: ModelTaskType):
|
|
3434
3447
|
if task_type == ModelTaskType.BINARY and (y.value_counts() < 1000).any():
|
|
3435
|
-
msg = bundle.get("binary_small_dataset")
|
|
3448
|
+
msg = self.bundle.get("binary_small_dataset")
|
|
3436
3449
|
self.logger.warning(msg)
|
|
3437
3450
|
print(msg)
|
|
3438
3451
|
|
|
@@ -3447,8 +3460,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3447
3460
|
self.logger.exception("Failed to dump python libs")
|
|
3448
3461
|
|
|
3449
3462
|
def __display_support_link(self, link_text: Optional[str] = None):
|
|
3450
|
-
support_link = bundle.get("support_link")
|
|
3451
|
-
link_text = link_text or bundle.get("support_text")
|
|
3463
|
+
support_link = self.bundle.get("support_link")
|
|
3464
|
+
link_text = link_text or self.bundle.get("support_text")
|
|
3452
3465
|
try:
|
|
3453
3466
|
from IPython.display import HTML, display
|
|
3454
3467
|
|
|
@@ -3564,7 +3577,7 @@ def _num_samples(x):
|
|
|
3564
3577
|
raise TypeError(message) from type_error
|
|
3565
3578
|
|
|
3566
3579
|
|
|
3567
|
-
def is_frames_equal(first, second) -> bool:
|
|
3580
|
+
def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
|
|
3568
3581
|
if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
|
|
3569
3582
|
isinstance(first, pd.Series) and isinstance(second, pd.Series)
|
|
3570
3583
|
):
|