upgini 1.1.244a24__py3-none-any.whl → 1.1.245a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

@@ -50,7 +50,7 @@ from upgini.metadata import (
  SearchKey,
  )
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
- from upgini.resource_bundle import bundle
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
  from upgini.search_task import SearchTask
  from upgini.spinner import Spinner
  from upgini.utils import combine_search_keys
@@ -186,8 +186,10 @@ class FeaturesEnricher(TransformerMixin):
  baseline_score_column: Optional[Any] = None,
  client_ip: Optional[str] = None,
  client_visitorid: Optional[str] = None,
+ custom_bundle_config: Optional[str] = None,
  **kwargs,
  ):
+ self.bundle = get_custom_bundle(custom_bundle_config)
  self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
  if api_key is not None and not isinstance(api_key, str):
  raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
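The most visible change in these hunks is the new `custom_bundle_config` constructor argument: each `FeaturesEnricher` instance now builds its own `self.bundle` via `get_custom_bundle()`, and the rest of the diff routes user-facing messages through that instance bundle instead of the module-level `bundle`. A minimal usage sketch, assuming `custom_bundle_config` accepts a path to a custom resource-bundle file (the accepted file format is not shown in this diff, and the file name and column mapping below are illustrative):

from upgini import FeaturesEnricher
from upgini.metadata import SearchKey

# Hypothetical example: "my_messages.properties" and the search-key mapping
# are assumptions for illustration, not values taken from the package.
enricher = FeaturesEnricher(
    search_keys={"phone_num": SearchKey.PHONE, "reg_date": SearchKey.DATE},
    custom_bundle_config="my_messages.properties",  # new optional argument in 1.1.245a1
)

# Omitting the argument presumably keeps the previous behaviour, with
# get_custom_bundle(None) falling back to the default bundle.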
@@ -240,23 +242,23 @@ class FeaturesEnricher(TransformerMixin):
  if search_id:
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

- print(bundle.get("search_by_task_id_start"))
+ print(self.bundle.get("search_by_task_id_start"))
  trace_id = str(uuid.uuid4())
  with MDC(trace_id=trace_id):
  try:
- self.logger.info(f"FeaturesEnricher created from existing search: {search_id}")
+ self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
  file_metadata = self._search_task.get_file_metadata(trace_id)
  x_columns = [c.originalName or c.name for c in file_metadata.columns]
  self.__prepare_feature_importances(trace_id, x_columns)
  # TODO validate search_keys with search_keys from file_metadata
- print(bundle.get("search_by_task_id_finish"))
- self.logger.info(f"Successfully initialized with search_id: {search_id}")
+ print(self.bundle.get("search_by_task_id_finish"))
+ self.logger.debug(f"Successfully initialized with search_id: {search_id}")
  except HttpError as e:
  if "Interrupted by client" in e.args[0]:
  raise ValidationError("Search was cancelled")
  except Exception as e:
- print(bundle.get("failed_search_by_task_id"))
+ print(self.bundle.get("failed_search_by_task_id"))
  self.logger.exception(f"Failed to find search_id: {search_id}")
  raise e

@@ -277,13 +279,13 @@ class FeaturesEnricher(TransformerMixin):
  self.round_embeddings = round_embeddings
  if generate_features is not None:
  if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
- msg = bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
+ msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
  self.logger.error(msg)
  raise ValidationError(msg)
  self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
  if round_embeddings is not None:
  if not isinstance(round_embeddings, int) or round_embeddings < 0:
- msg = bundle.get("invalid_round_embeddings")
+ msg = self.bundle.get("invalid_round_embeddings")
  self.logger.error(msg)
  raise ValidationError(msg)
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
@@ -309,7 +311,7 @@ class FeaturesEnricher(TransformerMixin):
  api_key = property(_get_api_key, _set_api_key)

  @staticmethod
- def _check_eval_set(eval_set, X):
+ def _check_eval_set(eval_set, X, bundle: ResourceBundle):
  checked_eval_set = []
  if eval_set is not None and isinstance(eval_set, tuple):
  eval_set = [eval_set]
@@ -318,7 +320,7 @@ class FeaturesEnricher(TransformerMixin):
  for eval_pair in eval_set or []:
  if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
  raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
- if not is_frames_equal(X, eval_pair[0]):
+ if not is_frames_equal(X, eval_pair[0], bundle):
  checked_eval_set.append(eval_pair)
  return checked_eval_set

@@ -401,7 +403,7 @@ class FeaturesEnricher(TransformerMixin):
  try:
  self.X = X
  self.y = y
- self.eval_set = self._check_eval_set(eval_set, X)
+ self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
  self.dump_input(trace_id, X, y, eval_set)
  self.__inner_fit(
  trace_id,
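Because `_check_eval_set` is a `@staticmethod`, it cannot see the new `self.bundle`, so the bundle is threaded through as an explicit parameter and forwarded to `is_frames_equal`; every call site in this diff follows the `self._check_eval_set(eval_set, X, self.bundle)` pattern shown above. A small self-contained toy (not upgini code) illustrating that design choice, with all names below invented for the example:

class Bundle:
    """Minimal stand-in for a resource bundle: key -> message template."""
    def __init__(self, messages):
        self.messages = messages

    def get(self, key):
        return self.messages[key]


class Checker:
    def __init__(self, bundle):
        self.bundle = bundle

    @staticmethod
    def check(pair, bundle):
        # the static helper has no `self`, so the bundle arrives as an argument
        if len(pair) != 2:
            raise ValueError(bundle.get("bad_pair").format(len(pair)))
        return pair

    def run(self, pair):
        # mirrors self._check_eval_set(eval_set, X, self.bundle)
        return self.check(pair, self.bundle)


bundle = Bundle({"bad_pair": "eval_set tuple must have 2 elements, got {}"})
print(Checker(bundle).run(("X", "y")))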
@@ -439,7 +441,7 @@ class FeaturesEnricher(TransformerMixin):
  if len(e.args) > 0 and (
  "File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("features_info_zero_important_features"))
+ self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
  self._show_error(str(e))
@@ -540,11 +542,13 @@ class FeaturesEnricher(TransformerMixin):
  try:
  self.X = X
  self.y = y
- self.eval_set = self._check_eval_set(eval_set, X)
+ self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
  self.dump_input(trace_id, X, y, eval_set)

  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
- raise ValidationError(bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
+ raise ValidationError(
+ self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS)
+ )

  self.__inner_fit(
  trace_id,
@@ -581,7 +585,7 @@ class FeaturesEnricher(TransformerMixin):
  if len(e.args) > 0 and (
  "File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("features_info_zero_important_features"))
+ self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
  return None
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
@@ -677,11 +681,11 @@ class FeaturesEnricher(TransformerMixin):
  self.__validate_search_keys(self.search_keys, self.search_id)
  try:
  if len(self.feature_names_) == 0:
- self.logger.warning(bundle.get("no_important_features_for_transform"))
+ self.logger.warning(self.bundle.get("no_important_features_for_transform"))
  return X

  if self._has_paid_features(exclude_features_sources):
- msg = bundle.get("transform_with_paid_features")
+ msg = self.bundle.get("transform_with_paid_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  return None
@@ -691,13 +695,15 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
  if transform_usage.has_limit:
  if len(X) > transform_usage.rest_rows:
- msg = bundle.get("transform_usage_warning").format(len(X), transform_usage.rest_rows)
+ msg = self.bundle.get("transform_usage_warning").format(
+ len(X), transform_usage.rest_rows
+ )
  self.logger.warning(msg)
  print(msg)
  show_request_quote_button()
  return None
  else:
- msg = bundle.get("transform_usage_info").format(
+ msg = self.bundle.get("transform_usage_info").format(
  transform_usage.limit, transform_usage.transformed_rows
  )
  self.logger.info("transform_usage_warning")
@@ -735,13 +741,13 @@ class FeaturesEnricher(TransformerMixin):
  if len(e.args) > 0 and (
  "File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("features_info_zero_important_features"))
+ self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
  return None
  elif len(e.args) > 0 and (
  "You have reached the quota limit of trial data usage" in str(e.args[0])
  or "Current user hasn't access to trial features" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("trial_quota_limit_riched"))
+ self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
  return None
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
@@ -858,7 +864,7 @@ class FeaturesEnricher(TransformerMixin):
  or (self.X is None and X is None)
  or (self.y is None and y is None)
  ):
- raise ValidationError(bundle.get("metrics_unfitted_enricher"))
+ raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

  if X is not None and y is None:
  raise ValidationError("X passed without y")
@@ -866,18 +872,12 @@ class FeaturesEnricher(TransformerMixin):
  effective_X = X if X is not None else self.X
  effective_eval_set = eval_set if eval_set is not None else self.eval_set

- effective_X = X if X is not None else self.X
- effective_eval_set = eval_set if eval_set is not None else self.eval_set
-
- effective_X = X if X is not None else self.X
- effective_eval_set = eval_set if eval_set is not None else self.eval_set
-
  validate_scoring_argument(scoring)

  self._validate_baseline_score(effective_X, effective_eval_set)

  if self._has_paid_features(exclude_features_sources):
- msg = bundle.get("metrics_with_paid_features")
+ msg = self.bundle.get("metrics_with_paid_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  return None
@@ -898,7 +898,7 @@ class FeaturesEnricher(TransformerMixin):
  if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
  search_keys_for_metrics.append(cat_feature)
  else:
- raise ValidationError(bundle.get("cat_feature_search_key").format(cat_feature))
+ raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))

  prepared_data = self._prepare_data_for_metrics(
  trace_id=trace_id,
@@ -928,10 +928,10 @@ class FeaturesEnricher(TransformerMixin):

  gc.collect()

- print(bundle.get("metrics_start"))
+ print(self.bundle.get("metrics_start"))
  with Spinner():
  if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
- print(bundle.get("metrics_no_important_free_features"))
+ print(self.bundle.get("metrics_no_important_free_features"))
  self.logger.warning("No client or free relevant ADS features found to calculate metrics")
  self.warning_counter.increment()
  return None
@@ -1025,20 +1025,25 @@ class FeaturesEnricher(TransformerMixin):
  effective_X = X if X is not None else self.X
  effective_y = y if y is not None else self.y
  train_metrics = {
- bundle.get("quality_metrics_segment_header"): bundle.get("quality_metrics_train_segment"),
- bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
- # bundle.get("quality_metrics_match_rate_header"): self._search_task.initial_max_hit_rate_v2(),
+ self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
+ "quality_metrics_train_segment"
+ ),
+ self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  y_sorted
  ):
- train_metrics[bundle.get("quality_metrics_mean_target_header")] = round(np.mean(effective_y), 4)
+ train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
+ np.mean(effective_y), 4
+ )
  if etalon_metric is not None:
- train_metrics[bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
+ train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
  if enriched_metric is not None:
- train_metrics[bundle.get("quality_metrics_enriched_header").format(metric)] = enriched_metric
+ train_metrics[
+ self.bundle.get("quality_metrics_enriched_header").format(metric)
+ ] = enriched_metric
  if uplift is not None:
- train_metrics[bundle.get("quality_metrics_uplift_header")] = uplift
+ train_metrics[self.bundle.get("quality_metrics_uplift_header")] = uplift
  metrics = [train_metrics]

  # 3 If eval_set is presented - fit final model on train enriched data and score each
@@ -1090,40 +1095,42 @@ class FeaturesEnricher(TransformerMixin):

  effective_eval_set = eval_set if eval_set is not None else self.eval_set
  eval_metrics = {
- bundle.get("quality_metrics_segment_header"): bundle.get(
+ self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
  "quality_metrics_eval_segment"
  ).format(idx + 1),
- bundle.get("quality_metrics_rows_header"): _num_samples(effective_eval_set[idx][0]),
- # bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
+ self.bundle.get("quality_metrics_rows_header"): _num_samples(
+ effective_eval_set[idx][0]
+ ),
+ # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  eval_y_sorted
  ):
- eval_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
+ eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
  np.mean(effective_eval_set[idx][1]), 4
  )
  if etalon_eval_metric is not None:
  eval_metrics[
- bundle.get("quality_metrics_baseline_header").format(metric)
+ self.bundle.get("quality_metrics_baseline_header").format(metric)
  ] = etalon_eval_metric
  if enriched_eval_metric is not None:
  eval_metrics[
- bundle.get("quality_metrics_enriched_header").format(metric)
+ self.bundle.get("quality_metrics_enriched_header").format(metric)
  ] = enriched_eval_metric
  if eval_uplift is not None:
- eval_metrics[bundle.get("quality_metrics_uplift_header")] = eval_uplift
+ eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = eval_uplift

  metrics.append(eval_metrics)

  metrics_df = pd.DataFrame(metrics)
- mean_target_hdr = bundle.get("quality_metrics_mean_target_header")
+ mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
  if mean_target_hdr in metrics_df.columns:
  metrics_df[mean_target_hdr] = metrics_df[mean_target_hdr].astype("float64")
  do_without_pandas_limits(
  lambda: self.logger.info(f"Metrics calculation finished successfully:\n{metrics_df}")
  )

- uplift_col = bundle.get("quality_metrics_uplift_header")
+ uplift_col = self.bundle.get("quality_metrics_uplift_header")
  date_column = self._get_date_column(search_keys)
  if (
  uplift_col in metrics_df.columns
@@ -1133,7 +1140,7 @@ class FeaturesEnricher(TransformerMixin):
  and date_column is not None
  and is_time_series(validated_X, date_column)
  ):
- msg = bundle.get("metrics_negative_uplift_without_cv")
+ msg = self.bundle.get("metrics_negative_uplift_without_cv")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
@@ -1149,7 +1156,7 @@ class FeaturesEnricher(TransformerMixin):
  "You have reached the quota limit of trial data usage" in str(e.args[0])
  or "Current user hasn't access to trial features" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("trial_quota_limit_riched"))
+ self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
  self._show_error(str(e))
@@ -1171,7 +1178,7 @@ class FeaturesEnricher(TransformerMixin):
  if res[1] < 0.05:
  uneven_distribution = True
  if uneven_distribution:
- msg = bundle.get("uneven_eval_target_distribution")
+ msg = self.bundle.get("uneven_eval_target_distribution")
  print(msg)
  self.logger.warning(msg)

@@ -1185,14 +1192,14 @@ class FeaturesEnricher(TransformerMixin):
  ) -> List[str]:
  if exclude_features_sources:
  filtered_features_info = self.features_info[
- ~self.features_info[bundle.get("features_info_name")].isin(exclude_features_sources)
+ ~self.features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
  ]
  else:
  filtered_features_info = self.features_info
  return list(
  filtered_features_info.loc[
- filtered_features_info[bundle.get("features_info_commercial_schema")] == commercial_schema,
- bundle.get("features_info_name"),
+ filtered_features_info[self.bundle.get("features_info_commercial_schema")] == commercial_schema,
+ self.bundle.get("features_info_name"),
  ].values
  )

@@ -1239,7 +1246,7 @@ class FeaturesEnricher(TransformerMixin):
  if X is None:
  return True, self.X, self.y, self.eval_set

- checked_eval_set = self._check_eval_set(eval_set, X)
+ checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)

  if (
  X is self.X
@@ -1280,7 +1287,7 @@ class FeaturesEnricher(TransformerMixin):
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  validated_X = self._validate_X(X)
  validated_y = self._validate_y(validated_X, y)
- checked_eval_set = self._check_eval_set(eval_set, X)
+ checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
  validated_eval_set = (
  [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
  if checked_eval_set
@@ -1409,7 +1416,7 @@ class FeaturesEnricher(TransformerMixin):
  return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
  else:
  self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
- print(bundle.get("prepare_data_for_metrics"))
+ print(self.bundle.get("prepare_data_for_metrics"))
  return self.__sample_imbalanced(
  validated_X,
  validated_y,
@@ -1503,7 +1510,7 @@ class FeaturesEnricher(TransformerMixin):
  not_msg = ""
  else:
  not_msg = "not "
- msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
+ msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
  print(msg)
  self.logger.warning(msg)

@@ -1529,7 +1536,7 @@ class FeaturesEnricher(TransformerMixin):
  if eval_set is not None:
  if len(enriched_eval_sets) != len(eval_set):
  raise ValidationError(
- bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
+ self.bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
  )

  for idx in range(len(eval_set)):
@@ -1680,7 +1687,7 @@ class FeaturesEnricher(TransformerMixin):
  def get_features_info(self) -> pd.DataFrame:
  """Returns pandas.DataFrame with SHAP values and other info for each feature."""
  if self._search_task is None or self._search_task.summary is None:
- msg = bundle.get("features_unfitted_enricher")
+ msg = self.bundle.get("features_unfitted_enricher")
  self.logger.warning(msg)
  raise NotFittedError(msg)

@@ -1694,9 +1701,9 @@ class FeaturesEnricher(TransformerMixin):

  def get_transactional_transform_api(self):
  if self.api_key is None:
- raise ValidationError(bundle.get("transactional_transform_unregistered"))
+ raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
  if self._search_task is None:
- raise ValidationError(bundle.get("transactional_transform_unfited"))
+ raise ValidationError(self.bundle.get("transactional_transform_unfited"))

  def key_example(key: SearchKey):
  if key == SearchKey.COUNTRY:
@@ -1761,7 +1768,7 @@ class FeaturesEnricher(TransformerMixin):
  ) -> pd.DataFrame:
  with MDC(trace_id=trace_id):
  if self._search_task is None:
- raise NotFittedError(bundle.get("transform_unfitted_enricher"))
+ raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))

  validated_X = self._validate_X(X, is_transform=True)

@@ -1773,13 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
  and not self.__is_registered
  and not is_demo_dataset
  ):
- msg = bundle.get("transform_with_trial_features")
+ msg = self.bundle.get("transform_with_trial_features")
  self.logger.warning(msg)
  print(msg)

  columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
  if len(columns_to_drop) > 0:
- msg = bundle.get("x_contains_enriching_columns").format(columns_to_drop)
+ msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
  self.logger.warning(msg)
  print(msg)
  validated_X = validated_X.drop(columns=columns_to_drop)
@@ -1796,7 +1803,7 @@ class FeaturesEnricher(TransformerMixin):
  df = self.__handle_index_search_keys(df, search_keys)

  if DEFAULT_INDEX in df.columns:
- msg = bundle.get("unsupported_index_column")
+ msg = self.bundle.get("unsupported_index_column")
  self.logger.info(msg)
  print(msg)
  df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -1909,9 +1916,9 @@ class FeaturesEnricher(TransformerMixin):
  gc.collect()

  if not silent_mode:
- print(bundle.get("polling_search_task").format(validation_task.search_task_id))
+ print(self.bundle.get("polling_search_task").format(validation_task.search_task_id))
  if not self.__is_registered:
- print(bundle.get("polling_unregister_information"))
+ print(self.bundle.get("polling_unregister_information"))

  progress = self.get_progress(trace_id, validation_task)
  progress.recalculate_eta(time.time() - start_time)
@@ -1937,10 +1944,10 @@ class FeaturesEnricher(TransformerMixin):
  time.sleep(polling_period_seconds)
  progress = self.get_progress(trace_id, validation_task)
  except KeyboardInterrupt as e:
- print(bundle.get("search_stopping"))
+ print(self.bundle.get("search_stopping"))
  self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
  self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
- print(bundle.get("search_stopped"))
+ print(self.bundle.get("search_stopped"))
  raise e

  validation_task.poll_result(trace_id, quiet=True)
@@ -1962,7 +1969,7 @@ class FeaturesEnricher(TransformerMixin):
  return res

  if not silent_mode:
- print(bundle.get("transform_start"))
+ print(self.bundle.get("transform_start"))
  # with Spinner():
  result = enrich()
  else:
@@ -1976,9 +1983,9 @@ class FeaturesEnricher(TransformerMixin):

  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
  features_info = self._internal_features_info
- comm_schema_header = bundle.get("features_info_commercial_schema")
- shap_value_header = bundle.get("features_info_shap")
- feature_name_header = bundle.get("features_info_name")
+ comm_schema_header = self.bundle.get("features_info_commercial_schema")
+ shap_value_header = self.bundle.get("features_info_shap")
+ feature_name_header = self.bundle.get("features_info_name")
  external_features = features_info[features_info[comm_schema_header].str.len() > 0]
  filtered_features = external_features
  if importance_threshold is not None:
@@ -2009,28 +2016,28 @@ class FeaturesEnricher(TransformerMixin):
  return
  else:
  self.logger.warning("search_keys not provided")
- raise ValidationError(bundle.get("empty_search_keys"))
+ raise ValidationError(self.bundle.get("empty_search_keys"))

  key_types = search_keys.values()

  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
- msg = bundle.get("date_and_datetime_simultanious")
+ msg = self.bundle.get("date_and_datetime_simultanious")
  self.logger.warning(msg)
  raise ValidationError(msg)

  if SearchKey.EMAIL in key_types and SearchKey.HEM in key_types:
- msg = bundle.get("email_and_hem_simultanious")
+ msg = self.bundle.get("email_and_hem_simultanious")
  self.logger.warning(msg)
  raise ValidationError(msg)

  if SearchKey.POSTAL_CODE in key_types and SearchKey.COUNTRY not in key_types and self.country_code is None:
- msg = bundle.get("postal_code_without_country")
+ msg = self.bundle.get("postal_code_without_country")
  self.logger.warning(msg)
  raise ValidationError(msg)

  for key_type in SearchKey.__members__.values():
  if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
- msg = bundle.get("multiple_search_key").format(key_type)
+ msg = self.bundle.get("multiple_search_key").format(key_type)
  self.logger.warning(msg)
  raise ValidationError(msg)

@@ -2040,7 +2047,7 @@ class FeaturesEnricher(TransformerMixin):
  # and not is_demo_dataset
  # and len(set(key_types).intersection(non_personal_keys)) == 0
  # ):
- # msg = bundle.get("unregistered_only_personal_keys")
+ # msg = self.bundle.get("unregistered_only_personal_keys")
  # self.logger.warning(msg + f" Provided search keys: {key_types}")
  # raise ValidationError(msg)

@@ -2081,19 +2088,22 @@ class FeaturesEnricher(TransformerMixin):
  )
  is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
  if is_demo_dataset:
- msg = bundle.get("demo_dataset_info")
+ msg = self.bundle.get("demo_dataset_info")
  self.logger.info(msg)
  if not self.__is_registered:
  print(msg)

  if self.generate_features is not None and len(self.generate_features) > 0:
  x_columns = list(validated_X.columns)
+ checked_generate_features = []
  for gen_feature in self.generate_features:
  if gen_feature not in x_columns:
- self.generate_features.remove(gen_feature)
- msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
+ msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
  print(msg)
  self.logger.warning(msg)
+ else:
+ checked_generate_features.append(gen_feature)
+ self.generate_features = checked_generate_features
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)

  validate_scoring_argument(scoring)
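The hunk above also fixes a subtle bug: the previous code called `self.generate_features.remove(gen_feature)` while iterating over that same list, which makes the iterator skip the element that follows each removal. The rewrite collects surviving names into `checked_generate_features` and assigns the result once. A small standalone illustration of the pattern being removed (values invented for the example, not taken from the package):

features = ["a", "b", "c"]
for f in features:
    # pretend every feature is missing from X, so each one gets removed
    features.remove(f)
print(features)  # -> ['b']: "b" is never visited because the list shifted under the iterator

Building a new list and reassigning it afterwards, as the new code does, visits every element exactly once.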
@@ -2134,7 +2144,7 @@ class FeaturesEnricher(TransformerMixin):
2134
2144
  df = pd.concat([df, eval_df])
2135
2145
 
2136
2146
  if DEFAULT_INDEX in df.columns:
2137
- msg = bundle.get("unsupported_index_column")
2147
+ msg = self.bundle.get("unsupported_index_column")
2138
2148
  self.logger.info(msg)
2139
2149
  print(msg)
2140
2150
  self.fit_dropped_features.add(DEFAULT_INDEX)
@@ -2237,9 +2247,9 @@ class FeaturesEnricher(TransformerMixin):
2237
2247
  if search_id_callback is not None:
2238
2248
  search_id_callback(self._search_task.search_task_id)
2239
2249
 
2240
- print(bundle.get("polling_search_task").format(self._search_task.search_task_id))
2250
+ print(self.bundle.get("polling_search_task").format(self._search_task.search_task_id))
2241
2251
  if not self.__is_registered:
2242
- print(bundle.get("polling_unregister_information"))
2252
+ print(self.bundle.get("polling_unregister_information"))
2243
2253
 
2244
2254
  progress = self.get_progress(trace_id)
2245
2255
  prev_progress = None
@@ -2265,14 +2275,14 @@ class FeaturesEnricher(TransformerMixin):
2265
2275
  f"Search {self._search_task.search_task_id} failed with error {progress.error}"
2266
2276
  f" and message {progress.error_message}"
2267
2277
  )
2268
- raise RuntimeError(bundle.get("search_task_failed_status"))
2278
+ raise RuntimeError(self.bundle.get("search_task_failed_status"))
2269
2279
  time.sleep(poll_period_seconds)
2270
2280
  progress = self.get_progress(trace_id)
2271
2281
  except KeyboardInterrupt as e:
2272
- print(bundle.get("search_stopping"))
2282
+ print(self.bundle.get("search_stopping"))
2273
2283
  self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
2274
2284
  self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
2275
- print(bundle.get("search_stopped"))
2285
+ print(self.bundle.get("search_stopped"))
2276
2286
  raise e
2277
2287
 
2278
2288
  self._search_task.poll_result(trace_id, quiet=True)
@@ -2293,7 +2303,7 @@ class FeaturesEnricher(TransformerMixin):
2293
2303
  )
2294
2304
  zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
2295
2305
  if zero_hit_columns:
2296
- msg = bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
2306
+ msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
2297
2307
  self.logger.warning(msg)
2298
2308
  self.__display_support_link(msg)
2299
2309
  self.warning_counter.increment()
@@ -2305,7 +2315,7 @@ class FeaturesEnricher(TransformerMixin):
2305
2315
  unused_features_for_generation = [
2306
2316
  dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
2307
2317
  ]
2308
- msg = bundle.get("features_not_generated").format(unused_features_for_generation)
2318
+ msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
2309
2319
  self.logger.warning(msg)
2310
2320
  print(msg)
2311
2321
  self.warning_counter.increment()
@@ -2320,7 +2330,7 @@ class FeaturesEnricher(TransformerMixin):
2320
2330
 
2321
2331
  if self._has_paid_features(exclude_features_sources):
2322
2332
  if calculate_metrics is not None and calculate_metrics:
2323
- msg = bundle.get("metrics_with_paid_features")
2333
+ msg = self.bundle.get("metrics_with_paid_features")
2324
2334
  self.logger.warning(msg)
2325
2335
  self.__display_support_link(msg)
2326
2336
  else:
@@ -2331,7 +2341,7 @@ class FeaturesEnricher(TransformerMixin):
2331
2341
  if len(validated_X) < self.CALCULATE_METRICS_MIN_THRESHOLD or any(
2332
2342
  [len(eval_X) < self.CALCULATE_METRICS_MIN_THRESHOLD for eval_X, _ in validated_eval_set]
2333
2343
  ):
2334
- msg = bundle.get("too_small_for_metrics")
2344
+ msg = self.bundle.get("too_small_for_metrics")
2335
2345
  self.logger.warning(msg)
2336
2346
  calculate_metrics = False
2337
2347
  elif len(dataset) * len(dataset.columns) > self.CALCULATE_METRICS_THRESHOLD:
@@ -2362,7 +2372,7 @@ class FeaturesEnricher(TransformerMixin):
2362
2372
  self.__show_report_button()
2363
2373
 
2364
2374
  if not self.warning_counter.has_warnings():
2365
- self.__display_support_link(bundle.get("all_ok_community_invite"))
2375
+ self.__display_support_link(self.bundle.get("all_ok_community_invite"))
2366
2376
 
2367
2377
  def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
2368
2378
  # Check Multivariate time series
@@ -2373,14 +2383,14 @@ class FeaturesEnricher(TransformerMixin):
2373
2383
  and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
2374
2384
  and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
2375
2385
  ):
2376
- msg = bundle.get("multivariate_timeseries_detected")
2386
+ msg = self.bundle.get("multivariate_timeseries_detected")
2377
2387
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
2378
2388
  elif (
2379
2389
  self.cv is None
2380
2390
  and model_task_type != ModelTaskType.REGRESSION
2381
2391
  and self._get_group_columns(df, self.fit_search_keys)
2382
2392
  ):
2383
- msg = bundle.get("group_k_fold_in_classification")
2393
+ msg = self.bundle.get("group_k_fold_in_classification")
2384
2394
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
2385
2395
 
2386
2396
  def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
@@ -2400,11 +2410,11 @@ class FeaturesEnricher(TransformerMixin):
2400
2410
 
2401
2411
  def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
2402
2412
  if _num_samples(X) == 0:
2403
- raise ValidationError(bundle.get("x_is_empty"))
2413
+ raise ValidationError(self.bundle.get("x_is_empty"))
2404
2414
 
2405
2415
  if isinstance(X, pd.DataFrame):
2406
2416
  if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
2407
- raise ValidationError(bundle.get("x_multiindex_unsupported"))
2417
+ raise ValidationError(self.bundle.get("x_multiindex_unsupported"))
2408
2418
  validated_X = X.copy()
2409
2419
  elif isinstance(X, pd.Series):
2410
2420
  validated_X = X.to_frame()
@@ -2413,12 +2423,12 @@ class FeaturesEnricher(TransformerMixin):
2413
2423
  renaming = {c: str(c) for c in validated_X.columns}
2414
2424
  validated_X = validated_X.rename(columns=renaming)
2415
2425
  else:
2416
- raise ValidationError(bundle.get("unsupported_x_type").format(type(X)))
2426
+ raise ValidationError(self.bundle.get("unsupported_x_type").format(type(X)))
2417
2427
 
2418
2428
  if len(set(validated_X.columns)) != len(validated_X.columns):
2419
- raise ValidationError(bundle.get("x_contains_dup_columns"))
2429
+ raise ValidationError(self.bundle.get("x_contains_dup_columns"))
2420
2430
  if not is_transform and not validated_X.index.is_unique:
2421
- raise ValidationError(bundle.get("x_non_unique_index"))
2431
+ raise ValidationError(self.bundle.get("x_non_unique_index"))
2422
2432
 
2423
2433
  if self.exclude_columns is not None:
2424
2434
  validated_X = validated_X.drop(columns=self.exclude_columns, errors="ignore")
@@ -2429,17 +2439,17 @@ class FeaturesEnricher(TransformerMixin):
2429
2439
  )
2430
2440
 
2431
2441
  if TARGET in validated_X.columns:
2432
- raise ValidationError(bundle.get("x_contains_reserved_column_name").format(TARGET))
2442
+ raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(TARGET))
2433
2443
  if not is_transform and EVAL_SET_INDEX in validated_X.columns:
2434
- raise ValidationError(bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
2444
+ raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
2435
2445
  if SYSTEM_RECORD_ID in validated_X.columns:
2436
- raise ValidationError(bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
2446
+ raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
2437
2447
 
2438
2448
  return validated_X
2439
2449
 
2440
2450
  def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
2441
2451
  if _num_samples(y) == 0:
2442
- raise ValidationError(bundle.get("y_is_empty"))
2452
+ raise ValidationError(self.bundle.get("y_is_empty"))
2443
2453
 
2444
2454
  if (
2445
2455
  not isinstance(y, pd.Series)
@@ -2447,26 +2457,26 @@ class FeaturesEnricher(TransformerMixin):
2447
2457
  and not isinstance(y, np.ndarray)
2448
2458
  and not isinstance(y, list)
2449
2459
  ):
2450
- raise ValidationError(bundle.get("unsupported_y_type").format(type(y)))
2460
+ raise ValidationError(self.bundle.get("unsupported_y_type").format(type(y)))
2451
2461
 
2452
2462
  if _num_samples(X) != _num_samples(y):
2453
- raise ValidationError(bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
2463
+ raise ValidationError(self.bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
2454
2464
 
2455
2465
  if isinstance(y, pd.DataFrame):
2456
2466
  if len(y.columns) != 1:
2457
- raise ValidationError(bundle.get("y_invalid_dimension_dataframe"))
2467
+ raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe"))
2458
2468
  if isinstance(y.columns, pd.MultiIndex) or isinstance(y.index, pd.MultiIndex):
2459
- raise ValidationError(bundle.get("y_multiindex_unsupported"))
2469
+ raise ValidationError(self.bundle.get("y_multiindex_unsupported"))
2460
2470
  y = y[y.columns[0]]
2461
2471
 
2462
2472
  if isinstance(y, pd.Series):
2463
2473
  if (y.index != X.index).any():
2464
- raise ValidationError(bundle.get("x_and_y_diff_index"))
2474
+ raise ValidationError(self.bundle.get("x_and_y_diff_index"))
2465
2475
  validated_y = y.copy()
2466
2476
  validated_y.rename(TARGET, inplace=True)
2467
2477
  elif isinstance(y, np.ndarray):
2468
2478
  if y.ndim != 1:
2469
- raise ValidationError(bundle.get("y_invalid_dimension_array"))
2479
+ raise ValidationError(self.bundle.get("y_invalid_dimension_array"))
2470
2480
  Xy = X.copy()
2471
2481
  Xy[TARGET] = y
2472
2482
  validated_y = Xy[TARGET].copy()
@@ -2476,24 +2486,24 @@ class FeaturesEnricher(TransformerMixin):
2476
2486
  validated_y = Xy[TARGET].copy()
2477
2487
 
2478
2488
  if validated_y.nunique() < 2:
2479
- raise ValidationError(bundle.get("y_is_constant"))
2489
+ raise ValidationError(self.bundle.get("y_is_constant"))
2480
2490
 
2481
2491
  return validated_y
2482
2492
 
2483
2493
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
2484
2494
  if len(eval_pair) != 2:
2485
- raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
2495
+ raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
2486
2496
  eval_X = eval_pair[0]
2487
2497
  eval_y = eval_pair[1]
2488
2498
 
2489
2499
  if _num_samples(eval_X) == 0:
2490
- raise ValidationError(bundle.get("eval_x_is_empty"))
2500
+ raise ValidationError(self.bundle.get("eval_x_is_empty"))
2491
2501
  if _num_samples(eval_y) == 0:
2492
- raise ValidationError(bundle.get("eval_y_is_empty"))
2502
+ raise ValidationError(self.bundle.get("eval_y_is_empty"))
2493
2503
 
2494
2504
  if isinstance(eval_X, pd.DataFrame):
2495
2505
  if isinstance(eval_X.columns, pd.MultiIndex) or isinstance(eval_X.index, pd.MultiIndex):
2496
- raise ValidationError(bundle.get("eval_x_multiindex_unsupported"))
2506
+ raise ValidationError(self.bundle.get("eval_x_multiindex_unsupported"))
2497
2507
  validated_eval_X = eval_X.copy()
2498
2508
  elif isinstance(eval_X, pd.Series):
2499
2509
  validated_eval_X = eval_X.to_frame()
@@ -2502,10 +2512,10 @@ class FeaturesEnricher(TransformerMixin):
2502
2512
  renaming = {c: str(c) for c in validated_eval_X.columns}
2503
2513
  validated_eval_X = validated_eval_X.rename(columns=renaming)
2504
2514
  else:
2505
- raise ValidationError(bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
2515
+ raise ValidationError(self.bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
2506
2516
 
2507
2517
  if not validated_eval_X.index.is_unique:
2508
- raise ValidationError(bundle.get("x_non_unique_index_eval_set"))
2518
+ raise ValidationError(self.bundle.get("x_non_unique_index_eval_set"))
2509
2519
 
2510
2520
  if self.exclude_columns is not None:
2511
2521
  validated_eval_X = validated_eval_X.drop(columns=self.exclude_columns, errors="ignore")
@@ -2519,28 +2529,30 @@ class FeaturesEnricher(TransformerMixin):
2519
2529
  if set(validated_eval_X.columns.to_list()) == set(X.columns.to_list()):
2520
2530
  validated_eval_X = validated_eval_X[X.columns.to_list()]
2521
2531
  else:
2522
- raise ValidationError(bundle.get("eval_x_and_x_diff_shape"))
2532
+ raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))
2523
2533
 
2524
2534
  if _num_samples(validated_eval_X) != _num_samples(eval_y):
2525
2535
  raise ValidationError(
2526
- bundle.get("x_and_y_diff_size_eval_set").format(_num_samples(validated_eval_X), _num_samples(eval_y))
2536
+ self.bundle.get("x_and_y_diff_size_eval_set").format(
2537
+ _num_samples(validated_eval_X), _num_samples(eval_y)
2538
+ )
2527
2539
  )
2528
2540
 
2529
2541
  if isinstance(eval_y, pd.DataFrame):
2530
2542
  if len(eval_y.columns) != 1:
2531
- raise ValidationError(bundle.get("y_invalid_dimension_dataframe_eval_set"))
2543
+ raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe_eval_set"))
2532
2544
  if isinstance(eval_y.columns, pd.MultiIndex) or isinstance(eval_y.index, pd.MultiIndex):
2533
- raise ValidationError(bundle.get("eval_y_multiindex_unsupported"))
2545
+ raise ValidationError(self.bundle.get("eval_y_multiindex_unsupported"))
2534
2546
  eval_y = eval_y[eval_y.columns[0]]
2535
2547
 
2536
2548
  if isinstance(eval_y, pd.Series):
2537
2549
  if (eval_y.index != validated_eval_X.index).any():
2538
- raise ValidationError(bundle.get("x_and_y_diff_index_eval_set"))
2550
+ raise ValidationError(self.bundle.get("x_and_y_diff_index_eval_set"))
2539
2551
  validated_eval_y = eval_y.copy()
2540
2552
  validated_eval_y.rename(TARGET, inplace=True)
2541
2553
  elif isinstance(eval_y, np.ndarray):
2542
2554
  if eval_y.ndim != 1:
2543
- raise ValidationError(bundle.get("y_invalid_dimension_array_eval_set"))
2555
+ raise ValidationError(self.bundle.get("y_invalid_dimension_array_eval_set"))
2544
2556
  Xy = validated_eval_X.copy()
2545
2557
  Xy[TARGET] = eval_y
2546
2558
  validated_eval_y = Xy[TARGET].copy()
@@ -2549,27 +2561,29 @@ class FeaturesEnricher(TransformerMixin):
2549
2561
  Xy[TARGET] = eval_y
2550
2562
  validated_eval_y = Xy[TARGET].copy()
2551
2563
  else:
2552
- raise ValidationError(bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
2564
+ raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
2553
2565
 
2554
2566
  if validated_eval_y.nunique() < 2:
2555
- raise ValidationError(bundle.get("y_is_constant_eval_set"))
2567
+ raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
2556
2568
 
2557
2569
  return validated_eval_X, validated_eval_y
2558
2570
 
2559
2571
  def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
2560
2572
  if self.baseline_score_column is not None:
2561
2573
  if self.baseline_score_column not in X.columns:
2562
- raise ValidationError(bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column))
2574
+ raise ValidationError(
2575
+ self.bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column)
2576
+ )
2563
2577
  if X[self.baseline_score_column].isna().any():
2564
- raise ValidationError(bundle.get("baseline_score_column_has_na"))
2578
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
2565
2579
  if eval_set is not None:
2566
2580
  if isinstance(eval_set, tuple):
2567
2581
  eval_set = [eval_set]
2568
2582
  for eval in eval_set:
2569
2583
  if self.baseline_score_column not in eval[0].columns:
2570
- raise ValidationError(bundle.get("baseline_score_column_not_exists"))
2584
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
2571
2585
  if eval[0][self.baseline_score_column].isna().any():
2572
- raise ValidationError(bundle.get("baseline_score_column_has_na"))
2586
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
2573
2587
 
2574
2588
  @staticmethod
2575
2589
  def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -2853,7 +2867,7 @@ class FeaturesEnricher(TransformerMixin):
2853
2867
  ) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
2854
2868
  if result_features is None:
2855
2869
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
2856
- raise RuntimeError(bundle.get("features_wasnt_returned"))
2870
+ raise RuntimeError(self.bundle.get("features_wasnt_returned"))
2857
2871
  result_features = (
2858
2872
  result_features.drop(columns=EVAL_SET_INDEX)
2859
2873
  if EVAL_SET_INDEX in result_features.columns
@@ -2864,7 +2878,7 @@ class FeaturesEnricher(TransformerMixin):
2864
2878
  dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
2865
2879
  if len(dup_features) > 0:
2866
2880
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
2867
- raise ValidationError(bundle.get("returned_features_same_as_passed").format(dup_features))
2881
+ raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
2868
2882
 
2869
2883
  # index overrites from result_features
2870
2884
  original_index_name = df_with_original_index.index.name
@@ -2924,10 +2938,10 @@ class FeaturesEnricher(TransformerMixin):
2924
2938
 
2925
2939
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str]):
2926
2940
  if self._search_task is None:
2927
- raise NotFittedError(bundle.get("transform_unfitted_enricher"))
2941
+ raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
2928
2942
  features_meta = self._search_task.get_all_features_metadata_v2()
2929
2943
  if features_meta is None:
2930
- raise Exception(bundle.get("missing_features_meta"))
2944
+ raise Exception(self.bundle.get("missing_features_meta"))
2931
2945
 
2932
2946
  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
2933
2947
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3017,38 +3031,38 @@ class FeaturesEnricher(TransformerMixin):
3017
3031
  )
3018
3032
  features_info.append(
3019
3033
  {
3020
- bundle.get("features_info_name"): feature_name,
3021
- bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3022
- bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3023
- bundle.get("features_info_value_preview"): feature_sample,
3024
- bundle.get("features_info_provider"): provider,
3025
- bundle.get("features_info_source"): source,
3026
- bundle.get("features_info_commercial_schema"): commercial_schema,
3034
+ self.bundle.get("features_info_name"): feature_name,
3035
+ self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3036
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3037
+ self.bundle.get("features_info_value_preview"): feature_sample,
3038
+ self.bundle.get("features_info_provider"): provider,
3039
+ self.bundle.get("features_info_source"): source,
3040
+ self.bundle.get("features_info_commercial_schema"): commercial_schema,
3027
3041
  }
3028
3042
  )
3029
3043
  features_info_without_links.append(
3030
3044
  {
3031
- bundle.get("features_info_name"): internal_feature_name,
3032
- bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3033
- bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3034
- bundle.get("features_info_value_preview"): feature_sample,
3035
- bundle.get("features_info_provider"): internal_provider,
3036
- bundle.get("features_info_source"): internal_source,
3037
- bundle.get("features_info_commercial_schema"): commercial_schema,
3045
+ self.bundle.get("features_info_name"): internal_feature_name,
3046
+ self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3047
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3048
+ self.bundle.get("features_info_value_preview"): feature_sample,
3049
+ self.bundle.get("features_info_provider"): internal_provider,
3050
+ self.bundle.get("features_info_source"): internal_source,
3051
+ self.bundle.get("features_info_commercial_schema"): commercial_schema,
3038
3052
  }
3039
3053
  )
3040
3054
  internal_features_info.append(
3041
3055
  {
3042
- bundle.get("features_info_name"): internal_feature_name,
3056
+ self.bundle.get("features_info_name"): internal_feature_name,
3043
3057
  "feature_link": feature_meta.doc_link,
3044
- bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3045
- bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3046
- bundle.get("features_info_value_preview"): feature_sample,
3047
- bundle.get("features_info_provider"): internal_provider,
3058
+ self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3059
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3060
+ self.bundle.get("features_info_value_preview"): feature_sample,
3061
+ self.bundle.get("features_info_provider"): internal_provider,
3048
3062
  "provider_link": feature_meta.data_provider_link,
3049
- bundle.get("features_info_source"): internal_source,
3063
+ self.bundle.get("features_info_source"): internal_source,
3050
3064
  "source_link": feature_meta.data_source_link,
3051
- bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3065
+ self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3052
3066
  }
3053
3067
  )
3054
3068
 
@@ -3058,8 +3072,10 @@ class FeaturesEnricher(TransformerMixin):
3058
3072
  self._internal_features_info = pd.DataFrame(internal_features_info)
3059
3073
  do_without_pandas_limits(lambda: self.logger.info(f"Features info:\n{self._internal_features_info}"))
3060
3074
 
3061
- self.relevant_data_sources = self._group_relevant_data_sources(self.features_info)
3062
- self._relevant_data_sources_wo_links = self._group_relevant_data_sources(self._features_info_without_links)
3075
+ self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
3076
+ self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
3077
+ self._features_info_without_links, self.bundle
3078
+ )
3063
3079
  do_without_pandas_limits(
3064
3080
  lambda: self.logger.info(f"Relevant data sources:\n{self._relevant_data_sources_wo_links}")
3065
3081
  )
@@ -3119,7 +3135,7 @@ class FeaturesEnricher(TransformerMixin):
3119
3135
  return None
3120
3136
 
3121
3137
  @staticmethod
3122
- def _group_relevant_data_sources(df: pd.DataFrame) -> pd.DataFrame:
3138
+ def _group_relevant_data_sources(df: pd.DataFrame, bundle: ResourceBundle) -> pd.DataFrame:
3123
3139
  return (
3124
3140
  df.query(f"{bundle.get('features_info_provider')} != ''")
3125
3141
  .groupby([bundle.get("features_info_provider"), bundle.get("features_info_source")])
@@ -3174,31 +3190,31 @@ class FeaturesEnricher(TransformerMixin):
3174
3190
  }
3175
3191
  passed_unsupported_search_keys = unsupported_search_keys.intersection(search_keys.values())
3176
3192
  if len(passed_unsupported_search_keys) > 0:
3177
- raise ValidationError(bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
3193
+ raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
3178
3194
 
3179
3195
  for column_id, meaning_type in search_keys.items():
3180
3196
  column_name = None
3181
3197
  if isinstance(column_id, str):
3182
3198
  if column_id not in x.columns:
3183
- raise ValidationError(bundle.get("search_key_not_found").format(column_id, list(x.columns)))
3199
+ raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
3184
3200
  column_name = column_id
3185
3201
  valid_search_keys[column_name] = meaning_type
3186
3202
  elif isinstance(column_id, int):
3187
3203
  if column_id >= x.shape[1]:
3188
- raise ValidationError(bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
3204
+ raise ValidationError(self.bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
3189
3205
  column_name = x.columns[column_id]
3190
3206
  valid_search_keys[column_name] = meaning_type
3191
3207
  else:
3192
- raise ValidationError(bundle.get("unsupported_search_key_type").format(type(column_id)))
3208
+ raise ValidationError(self.bundle.get("unsupported_search_key_type").format(type(column_id)))
3193
3209
 
3194
3210
  if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
3195
- msg = bundle.get("search_key_country_and_country_code")
3211
+ msg = self.bundle.get("search_key_country_and_country_code")
3196
3212
  self.logger.warning(msg)
3197
3213
  print(msg)
3198
3214
  self.country_code = None
3199
3215
 
3200
3216
  if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
3201
- msg = bundle.get("unregistered_with_personal_keys").format(meaning_type)
3217
+ msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
3202
3218
  self.logger.warning(msg)
3203
3219
  if not silent_mode:
3204
3220
  self.warning_counter.increment()
@@ -3209,7 +3225,7 @@ class FeaturesEnricher(TransformerMixin):
3209
3225
  if x[column_name].isnull().all() or (
3210
3226
  is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
3211
3227
  ):
3212
- raise ValidationError(bundle.get("empty_search_key").format(column_name))
3228
+ raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
3213
3229
 
3214
3230
  if self.detect_missing_search_keys and (
3215
3231
  not is_transform or set(valid_search_keys.values()) != set(self.fit_search_keys.values())
@@ -3219,7 +3235,7 @@ class FeaturesEnricher(TransformerMixin):
3219
3235
  )
3220
3236
 
3221
3237
  if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
3222
- msg = bundle.get("unregistered_only_personal_keys")
3238
+ msg = self.bundle.get("unregistered_only_personal_keys")
3223
3239
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
3224
3240
  raise ValidationError(msg)
3225
3241
 
@@ -3234,7 +3250,7 @@ class FeaturesEnricher(TransformerMixin):
3234
3250
  and next(iter(valid_search_keys.values())) == SearchKey.DATE
3235
3251
  and not silent_mode
3236
3252
  ):
3237
- msg = bundle.get("date_only_search")
3253
+ msg = self.bundle.get("date_only_search")
3238
3254
  print(msg)
3239
3255
  self.logger.warning(msg)
3240
3256
  self.warning_counter.increment()
@@ -3243,7 +3259,7 @@ class FeaturesEnricher(TransformerMixin):
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
  date_column = next(iter(maybe_date))
  if x[date_column].nunique() > 0.9 * _num_samples(x):
- msg = bundle.get("date_search_without_time_series")
+ msg = self.bundle.get("date_search_without_time_series")
  print(msg)
  self.logger.warning(msg)
  self.warning_counter.increment()
@@ -3252,7 +3268,7 @@ class FeaturesEnricher(TransformerMixin):
  for k, v in valid_search_keys.items():
  # Show warning for country only if country is the only key
  if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
- msg = bundle.get("single_constant_search_key").format(v, x[k].values[0])
+ msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
  print(msg)
  self.logger.warning(msg)
  self.warning_counter.increment()
@@ -3284,11 +3300,11 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback=progress_callback,
  )
  if self.metrics is not None:
- msg = bundle.get("quality_metrics_header")
+ msg = self.bundle.get("quality_metrics_header")
  display_html_dataframe(self.metrics, self.metrics, msg)

  def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
- msg = bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))
+ msg = self.bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))

  try:
  _ = get_ipython() # type: ignore
@@ -3297,16 +3313,16 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(msg)
  if len(self.feature_names_) > 0:
  display_html_dataframe(
- self.features_info, self._features_info_without_links, bundle.get("relevant_features_header")
+ self.features_info, self._features_info_without_links, self.bundle.get("relevant_features_header")
  )

  display_html_dataframe(
  self.relevant_data_sources,
  self._relevant_data_sources_wo_links,
- bundle.get("relevant_data_sources_header"),
+ self.bundle.get("relevant_data_sources_header"),
  )
  else:
- msg = bundle.get("features_info_zero_important_features")
+ msg = self.bundle.get("features_info_zero_important_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  self.warning_counter.increment()
@@ -3333,14 +3349,14 @@ class FeaturesEnricher(TransformerMixin):
  return float(importance_threshold) if importance_threshold is not None else 0.0
  except ValueError:
  self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
- raise ValidationError(bundle.get("invalid_importance_threshold"))
+ raise ValidationError(self.bundle.get("invalid_importance_threshold"))

  def __validate_max_features(self, max_features: Optional[int]) -> int:
  try:
  return int(max_features) if max_features is not None else 400
  except ValueError:
  self.logger.exception(f"Invalid max_features provided: {max_features}")
- raise ValidationError(bundle.get("invalid_max_features"))
+ raise ValidationError(self.bundle.get("invalid_max_features"))

  def __filtered_enriched_features(
  self,
@@ -3372,7 +3388,7 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
  self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("postal_code_detected").format(maybe_key))
+ print(self.bundle.get("postal_code_detected").format(maybe_key))

  if (
  SearchKey.COUNTRY not in search_keys.values()
@@ -3385,7 +3401,7 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("country_detected").format(maybe_key))
+ print(self.bundle.get("country_detected").format(maybe_key))

  if (
  SearchKey.EMAIL not in search_keys.values()
@@ -3399,13 +3415,13 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
  self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("email_detected").format(maybe_key))
+ print(self.bundle.get("email_detected").format(maybe_key))
  else:
  self.logger.warning(
  f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
  )
  if not silent_mode:
- print(bundle.get("email_detected_not_registered").format(maybe_key))
+ print(self.bundle.get("email_detected_not_registered").format(maybe_key))
  self.warning_counter.increment()

  if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
@@ -3416,20 +3432,20 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
  self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("phone_detected").format(maybe_key))
+ print(self.bundle.get("phone_detected").format(maybe_key))
  else:
  self.logger.warning(
  f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
  )
  if not silent_mode:
- print(bundle.get("phone_detected_not_registered"))
+ print(self.bundle.get("phone_detected_not_registered"))
  self.warning_counter.increment()

  return search_keys

  def _validate_binary_observations(self, y, task_type: ModelTaskType):
  if task_type == ModelTaskType.BINARY and (y.value_counts() < 1000).any():
- msg = bundle.get("binary_small_dataset")
+ msg = self.bundle.get("binary_small_dataset")
  self.logger.warning(msg)
  print(msg)
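
Aside: the binary-target check above flags any class with fewer than 1000 observations. A small self-contained illustration of the same expression (the series below is synthetic):

    import pandas as pd

    y = pd.Series([0] * 5000 + [1] * 300)
    # mirrors the condition in _validate_binary_observations: True here,
    # because the positive class has only 300 rows, so the
    # "binary_small_dataset" message would be printed
    print((y.value_counts() < 1000).any())
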
 
@@ -3444,8 +3460,8 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.exception("Failed to dump python libs")

  def __display_support_link(self, link_text: Optional[str] = None):
- support_link = bundle.get("support_link")
- link_text = link_text or bundle.get("support_text")
+ support_link = self.bundle.get("support_link")
+ link_text = link_text or self.bundle.get("support_text")
  try:
  from IPython.display import HTML, display
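
Aside: the change running through these hunks is uniform: messages are resolved from an instance-held ResourceBundle (self.bundle) instead of the module-level bundle, and free functions such as is_frames_equal in the next hunk now take the bundle as an explicit argument. A minimal sketch of that pattern; the class name and constructor are hypothetical, and only ResourceBundle, its get method, and the "binary_small_dataset" key appear in the diff:

    from upgini.resource_bundle import ResourceBundle

    class MessageSource:
        def __init__(self, bundle: ResourceBundle):
            # the bundle is held per instance, so two instances can resolve
            # the same key to different (for example customized) texts
            self.bundle = bundle

        def warn_small_dataset(self) -> None:
            # instance-level lookup, mirroring self.bundle.get(...) above
            print(self.bundle.get("binary_small_dataset"))

Passing the bundle explicitly keeps module-level helpers usable with a customized bundle without relying on shared module state.
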
 
@@ -3561,7 +3577,7 @@ def _num_samples(x):
  raise TypeError(message) from type_error


- def is_frames_equal(first, second) -> bool:
+ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
  if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
  isinstance(first, pd.Series) and isinstance(second, pd.Series)
  ):