upgini 1.1.244a25__py3-none-any.whl → 1.1.245a1__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.

Potentially problematic release: this version of upgini might be problematic.

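The change set below threads a ResourceBundle through FeaturesEnricher: the constructor gains a custom_bundle_config argument, resolves it with get_custom_bundle() into self.bundle, message lookups switch from the module-level bundle to self.bundle, and helpers such as _check_eval_set, _group_relevant_data_sources and is_frames_equal now receive the bundle explicitly. A minimal usage sketch, assuming the documented "from upgini import FeaturesEnricher, SearchKey" entry points; the column name and config path below are hypothetical examples, not values from this release:

    # Sketch only: "reg_date" and "custom_bundle.properties" are assumed examples.
    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(
        search_keys={"reg_date": SearchKey.DATE},
        # New in this release: a custom resource bundle config, resolved via
        # get_custom_bundle(custom_bundle_config) and stored as self.bundle.
        custom_bundle_config="custom_bundle.properties",
    )
    # The enricher now reads its user-facing messages from self.bundle rather than
    # the shared module-level bundle, so texts can be overridden per instance.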
@@ -50,7 +50,7 @@ from upgini.metadata import (
  SearchKey,
  )
  from upgini.metrics import EstimatorWrapper, validate_scoring_argument
- from upgini.resource_bundle import bundle
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
  from upgini.search_task import SearchTask
  from upgini.spinner import Spinner
  from upgini.utils import combine_search_keys
@@ -186,8 +186,10 @@ class FeaturesEnricher(TransformerMixin):
  baseline_score_column: Optional[Any] = None,
  client_ip: Optional[str] = None,
  client_visitorid: Optional[str] = None,
+ custom_bundle_config: Optional[str] = None,
  **kwargs,
  ):
+ self.bundle = get_custom_bundle(custom_bundle_config)
  self._api_key = api_key or os.environ.get(UPGINI_API_KEY)
  if api_key is not None and not isinstance(api_key, str):
  raise ValidationError(f"api_key should be `string`, but passed: `{api_key}`")
@@ -240,23 +242,23 @@ class FeaturesEnricher(TransformerMixin):
  if search_id:
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

- print(bundle.get("search_by_task_id_start"))
+ print(self.bundle.get("search_by_task_id_start"))
  trace_id = str(uuid.uuid4())
  with MDC(trace_id=trace_id):
  try:
- self.logger.info(f"FeaturesEnricher created from existing search: {search_id}")
+ self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
  file_metadata = self._search_task.get_file_metadata(trace_id)
  x_columns = [c.originalName or c.name for c in file_metadata.columns]
  self.__prepare_feature_importances(trace_id, x_columns)
  # TODO validate search_keys with search_keys from file_metadata
- print(bundle.get("search_by_task_id_finish"))
- self.logger.info(f"Successfully initialized with search_id: {search_id}")
+ print(self.bundle.get("search_by_task_id_finish"))
+ self.logger.debug(f"Successfully initialized with search_id: {search_id}")
  except HttpError as e:
  if "Interrupted by client" in e.args[0]:
  raise ValidationError("Search was cancelled")
  except Exception as e:
- print(bundle.get("failed_search_by_task_id"))
+ print(self.bundle.get("failed_search_by_task_id"))
  self.logger.exception(f"Failed to find search_id: {search_id}")
  raise e

@@ -277,13 +279,13 @@ class FeaturesEnricher(TransformerMixin):
  self.round_embeddings = round_embeddings
  if generate_features is not None:
  if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
- msg = bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
+ msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
  self.logger.error(msg)
  raise ValidationError(msg)
  self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
  if round_embeddings is not None:
  if not isinstance(round_embeddings, int) or round_embeddings < 0:
- msg = bundle.get("invalid_round_embeddings")
+ msg = self.bundle.get("invalid_round_embeddings")
  self.logger.error(msg)
  raise ValidationError(msg)
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
@@ -309,7 +311,7 @@ class FeaturesEnricher(TransformerMixin):
  api_key = property(_get_api_key, _set_api_key)

  @staticmethod
- def _check_eval_set(eval_set, X):
+ def _check_eval_set(eval_set, X, bundle: ResourceBundle):
  checked_eval_set = []
  if eval_set is not None and isinstance(eval_set, tuple):
  eval_set = [eval_set]
@@ -318,7 +320,7 @@ class FeaturesEnricher(TransformerMixin):
  for eval_pair in eval_set or []:
  if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
  raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
- if not is_frames_equal(X, eval_pair[0]):
+ if not is_frames_equal(X, eval_pair[0], bundle):
  checked_eval_set.append(eval_pair)
  return checked_eval_set

@@ -401,7 +403,7 @@ class FeaturesEnricher(TransformerMixin):
  try:
  self.X = X
  self.y = y
- self.eval_set = self._check_eval_set(eval_set, X)
+ self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
  self.dump_input(trace_id, X, y, eval_set)
  self.__inner_fit(
  trace_id,
@@ -439,7 +441,7 @@ class FeaturesEnricher(TransformerMixin):
  if len(e.args) > 0 and (
  "File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("features_info_zero_important_features"))
+ self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
  self._show_error(str(e))
@@ -540,11 +542,13 @@ class FeaturesEnricher(TransformerMixin):
  try:
  self.X = X
  self.y = y
- self.eval_set = self._check_eval_set(eval_set, X)
+ self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
  self.dump_input(trace_id, X, y, eval_set)

  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
- raise ValidationError(bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
+ raise ValidationError(
+ self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS)
+ )

  self.__inner_fit(
  trace_id,
@@ -581,7 +585,7 @@ class FeaturesEnricher(TransformerMixin):
  if len(e.args) > 0 and (
  "File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("features_info_zero_important_features"))
+ self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
  return None
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
@@ -677,11 +681,11 @@ class FeaturesEnricher(TransformerMixin):
  self.__validate_search_keys(self.search_keys, self.search_id)
  try:
  if len(self.feature_names_) == 0:
- self.logger.warning(bundle.get("no_important_features_for_transform"))
+ self.logger.warning(self.bundle.get("no_important_features_for_transform"))
  return X

  if self._has_paid_features(exclude_features_sources):
- msg = bundle.get("transform_with_paid_features")
+ msg = self.bundle.get("transform_with_paid_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  return None
@@ -691,13 +695,15 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
  if transform_usage.has_limit:
  if len(X) > transform_usage.rest_rows:
- msg = bundle.get("transform_usage_warning").format(len(X), transform_usage.rest_rows)
+ msg = self.bundle.get("transform_usage_warning").format(
+ len(X), transform_usage.rest_rows
+ )
  self.logger.warning(msg)
  print(msg)
  show_request_quote_button()
  return None
  else:
- msg = bundle.get("transform_usage_info").format(
+ msg = self.bundle.get("transform_usage_info").format(
  transform_usage.limit, transform_usage.transformed_rows
  )
  self.logger.info("transform_usage_warning")
@@ -735,13 +741,13 @@ class FeaturesEnricher(TransformerMixin):
  if len(e.args) > 0 and (
  "File doesn't intersect with any ADS" in str(e.args[0]) or "Empty intersection" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("features_info_zero_important_features"))
+ self.__display_support_link(self.bundle.get("features_info_zero_important_features"))
  return None
  elif len(e.args) > 0 and (
  "You have reached the quota limit of trial data usage" in str(e.args[0])
  or "Current user hasn't access to trial features" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("trial_quota_limit_riched"))
+ self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
  return None
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
@@ -858,7 +864,7 @@ class FeaturesEnricher(TransformerMixin):
  or (self.X is None and X is None)
  or (self.y is None and y is None)
  ):
- raise ValidationError(bundle.get("metrics_unfitted_enricher"))
+ raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

  if X is not None and y is None:
  raise ValidationError("X passed without y")
@@ -866,18 +872,12 @@ class FeaturesEnricher(TransformerMixin):
  effective_X = X if X is not None else self.X
  effective_eval_set = eval_set if eval_set is not None else self.eval_set

- effective_X = X if X is not None else self.X
- effective_eval_set = eval_set if eval_set is not None else self.eval_set
-
- effective_X = X if X is not None else self.X
- effective_eval_set = eval_set if eval_set is not None else self.eval_set
-
  validate_scoring_argument(scoring)

  self._validate_baseline_score(effective_X, effective_eval_set)

  if self._has_paid_features(exclude_features_sources):
- msg = bundle.get("metrics_with_paid_features")
+ msg = self.bundle.get("metrics_with_paid_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  return None
@@ -898,7 +898,7 @@ class FeaturesEnricher(TransformerMixin):
  if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
  search_keys_for_metrics.append(cat_feature)
  else:
- raise ValidationError(bundle.get("cat_feature_search_key").format(cat_feature))
+ raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))

  prepared_data = self._prepare_data_for_metrics(
  trace_id=trace_id,
@@ -928,10 +928,10 @@ class FeaturesEnricher(TransformerMixin):

  gc.collect()

- print(bundle.get("metrics_start"))
+ print(self.bundle.get("metrics_start"))
  with Spinner():
  if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
- print(bundle.get("metrics_no_important_free_features"))
+ print(self.bundle.get("metrics_no_important_free_features"))
  self.logger.warning("No client or free relevant ADS features found to calculate metrics")
  self.warning_counter.increment()
  return None
@@ -1025,20 +1025,25 @@ class FeaturesEnricher(TransformerMixin):
  effective_X = X if X is not None else self.X
  effective_y = y if y is not None else self.y
  train_metrics = {
- bundle.get("quality_metrics_segment_header"): bundle.get("quality_metrics_train_segment"),
- bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
- # bundle.get("quality_metrics_match_rate_header"): self._search_task.initial_max_hit_rate_v2(),
+ self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
+ "quality_metrics_train_segment"
+ ),
+ self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  y_sorted
  ):
- train_metrics[bundle.get("quality_metrics_mean_target_header")] = round(np.mean(effective_y), 4)
+ train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
+ np.mean(effective_y), 4
+ )
  if etalon_metric is not None:
- train_metrics[bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
+ train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
  if enriched_metric is not None:
- train_metrics[bundle.get("quality_metrics_enriched_header").format(metric)] = enriched_metric
+ train_metrics[
+ self.bundle.get("quality_metrics_enriched_header").format(metric)
+ ] = enriched_metric
  if uplift is not None:
- train_metrics[bundle.get("quality_metrics_uplift_header")] = uplift
+ train_metrics[self.bundle.get("quality_metrics_uplift_header")] = uplift
  metrics = [train_metrics]

  # 3 If eval_set is presented - fit final model on train enriched data and score each
@@ -1090,40 +1095,42 @@ class FeaturesEnricher(TransformerMixin):

  effective_eval_set = eval_set if eval_set is not None else self.eval_set
  eval_metrics = {
- bundle.get("quality_metrics_segment_header"): bundle.get(
+ self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
  "quality_metrics_eval_segment"
  ).format(idx + 1),
- bundle.get("quality_metrics_rows_header"): _num_samples(effective_eval_set[idx][0]),
- # bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
+ self.bundle.get("quality_metrics_rows_header"): _num_samples(
+ effective_eval_set[idx][0]
+ ),
+ # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  eval_y_sorted
  ):
- eval_metrics[bundle.get("quality_metrics_mean_target_header")] = round(
+ eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
  np.mean(effective_eval_set[idx][1]), 4
  )
  if etalon_eval_metric is not None:
  eval_metrics[
- bundle.get("quality_metrics_baseline_header").format(metric)
+ self.bundle.get("quality_metrics_baseline_header").format(metric)
  ] = etalon_eval_metric
  if enriched_eval_metric is not None:
  eval_metrics[
- bundle.get("quality_metrics_enriched_header").format(metric)
+ self.bundle.get("quality_metrics_enriched_header").format(metric)
  ] = enriched_eval_metric
  if eval_uplift is not None:
- eval_metrics[bundle.get("quality_metrics_uplift_header")] = eval_uplift
+ eval_metrics[self.bundle.get("quality_metrics_uplift_header")] = eval_uplift

  metrics.append(eval_metrics)

  metrics_df = pd.DataFrame(metrics)
- mean_target_hdr = bundle.get("quality_metrics_mean_target_header")
+ mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
  if mean_target_hdr in metrics_df.columns:
  metrics_df[mean_target_hdr] = metrics_df[mean_target_hdr].astype("float64")
  do_without_pandas_limits(
  lambda: self.logger.info(f"Metrics calculation finished successfully:\n{metrics_df}")
  )

- uplift_col = bundle.get("quality_metrics_uplift_header")
+ uplift_col = self.bundle.get("quality_metrics_uplift_header")
  date_column = self._get_date_column(search_keys)
  if (
  uplift_col in metrics_df.columns
@@ -1133,7 +1140,7 @@ class FeaturesEnricher(TransformerMixin):
  and date_column is not None
  and is_time_series(validated_X, date_column)
  ):
- msg = bundle.get("metrics_negative_uplift_without_cv")
+ msg = self.bundle.get("metrics_negative_uplift_without_cv")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
@@ -1149,7 +1156,7 @@ class FeaturesEnricher(TransformerMixin):
  "You have reached the quota limit of trial data usage" in str(e.args[0])
  or "Current user hasn't access to trial features" in str(e.args[0])
  ):
- self.__display_support_link(bundle.get("trial_quota_limit_riched"))
+ self.__display_support_link(self.bundle.get("trial_quota_limit_riched"))
  elif isinstance(e, ValidationError):
  self._dump_python_libs()
  self._show_error(str(e))
@@ -1171,7 +1178,7 @@ class FeaturesEnricher(TransformerMixin):
  if res[1] < 0.05:
  uneven_distribution = True
  if uneven_distribution:
- msg = bundle.get("uneven_eval_target_distribution")
+ msg = self.bundle.get("uneven_eval_target_distribution")
  print(msg)
  self.logger.warning(msg)

@@ -1185,14 +1192,14 @@ class FeaturesEnricher(TransformerMixin):
  ) -> List[str]:
  if exclude_features_sources:
  filtered_features_info = self.features_info[
- ~self.features_info[bundle.get("features_info_name")].isin(exclude_features_sources)
+ ~self.features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
  ]
  else:
  filtered_features_info = self.features_info
  return list(
  filtered_features_info.loc[
- filtered_features_info[bundle.get("features_info_commercial_schema")] == commercial_schema,
- bundle.get("features_info_name"),
+ filtered_features_info[self.bundle.get("features_info_commercial_schema")] == commercial_schema,
+ self.bundle.get("features_info_name"),
  ].values
  )

@@ -1239,7 +1246,7 @@ class FeaturesEnricher(TransformerMixin):
  if X is None:
  return True, self.X, self.y, self.eval_set

- checked_eval_set = self._check_eval_set(eval_set, X)
+ checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)

  if (
  X is self.X
@@ -1280,7 +1287,7 @@ class FeaturesEnricher(TransformerMixin):
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  validated_X = self._validate_X(X)
  validated_y = self._validate_y(validated_X, y)
- checked_eval_set = self._check_eval_set(eval_set, X)
+ checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
  validated_eval_set = (
  [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
  if checked_eval_set
@@ -1409,7 +1416,7 @@ class FeaturesEnricher(TransformerMixin):
  return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
  else:
  self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
- print(bundle.get("prepare_data_for_metrics"))
+ print(self.bundle.get("prepare_data_for_metrics"))
  return self.__sample_imbalanced(
  validated_X,
  validated_y,
@@ -1503,7 +1510,7 @@ class FeaturesEnricher(TransformerMixin):
  not_msg = ""
  else:
  not_msg = "not "
- msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
+ msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
  print(msg)
  self.logger.warning(msg)

@@ -1529,7 +1536,7 @@ class FeaturesEnricher(TransformerMixin):
  if eval_set is not None:
  if len(enriched_eval_sets) != len(eval_set):
  raise ValidationError(
- bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
+ self.bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
  )

  for idx in range(len(eval_set)):
@@ -1680,7 +1687,7 @@ class FeaturesEnricher(TransformerMixin):
  def get_features_info(self) -> pd.DataFrame:
  """Returns pandas.DataFrame with SHAP values and other info for each feature."""
  if self._search_task is None or self._search_task.summary is None:
- msg = bundle.get("features_unfitted_enricher")
+ msg = self.bundle.get("features_unfitted_enricher")
  self.logger.warning(msg)
  raise NotFittedError(msg)

@@ -1694,9 +1701,9 @@ class FeaturesEnricher(TransformerMixin):

  def get_transactional_transform_api(self):
  if self.api_key is None:
- raise ValidationError(bundle.get("transactional_transform_unregistered"))
+ raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
  if self._search_task is None:
- raise ValidationError(bundle.get("transactional_transform_unfited"))
+ raise ValidationError(self.bundle.get("transactional_transform_unfited"))

  def key_example(key: SearchKey):
  if key == SearchKey.COUNTRY:
@@ -1761,7 +1768,7 @@ class FeaturesEnricher(TransformerMixin):
  ) -> pd.DataFrame:
  with MDC(trace_id=trace_id):
  if self._search_task is None:
- raise NotFittedError(bundle.get("transform_unfitted_enricher"))
+ raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))

  validated_X = self._validate_X(X, is_transform=True)

@@ -1773,13 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
  and not self.__is_registered
  and not is_demo_dataset
  ):
- msg = bundle.get("transform_with_trial_features")
+ msg = self.bundle.get("transform_with_trial_features")
  self.logger.warning(msg)
  print(msg)

  columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
  if len(columns_to_drop) > 0:
- msg = bundle.get("x_contains_enriching_columns").format(columns_to_drop)
+ msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
  self.logger.warning(msg)
  print(msg)
  validated_X = validated_X.drop(columns=columns_to_drop)
@@ -1796,7 +1803,7 @@ class FeaturesEnricher(TransformerMixin):
  df = self.__handle_index_search_keys(df, search_keys)

  if DEFAULT_INDEX in df.columns:
- msg = bundle.get("unsupported_index_column")
+ msg = self.bundle.get("unsupported_index_column")
  self.logger.info(msg)
  print(msg)
  df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -1909,9 +1916,9 @@ class FeaturesEnricher(TransformerMixin):
  gc.collect()

  if not silent_mode:
- print(bundle.get("polling_search_task").format(validation_task.search_task_id))
+ print(self.bundle.get("polling_search_task").format(validation_task.search_task_id))
  if not self.__is_registered:
- print(bundle.get("polling_unregister_information"))
+ print(self.bundle.get("polling_unregister_information"))

  progress = self.get_progress(trace_id, validation_task)
  progress.recalculate_eta(time.time() - start_time)
@@ -1937,10 +1944,10 @@ class FeaturesEnricher(TransformerMixin):
  time.sleep(polling_period_seconds)
  progress = self.get_progress(trace_id, validation_task)
  except KeyboardInterrupt as e:
- print(bundle.get("search_stopping"))
+ print(self.bundle.get("search_stopping"))
  self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
  self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
- print(bundle.get("search_stopped"))
+ print(self.bundle.get("search_stopped"))
  raise e

  validation_task.poll_result(trace_id, quiet=True)
@@ -1962,7 +1969,7 @@ class FeaturesEnricher(TransformerMixin):
  return res

  if not silent_mode:
- print(bundle.get("transform_start"))
+ print(self.bundle.get("transform_start"))
  # with Spinner():
  result = enrich()
  else:
@@ -1976,9 +1983,9 @@ class FeaturesEnricher(TransformerMixin):

  def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
  features_info = self._internal_features_info
- comm_schema_header = bundle.get("features_info_commercial_schema")
- shap_value_header = bundle.get("features_info_shap")
- feature_name_header = bundle.get("features_info_name")
+ comm_schema_header = self.bundle.get("features_info_commercial_schema")
+ shap_value_header = self.bundle.get("features_info_shap")
+ feature_name_header = self.bundle.get("features_info_name")
  external_features = features_info[features_info[comm_schema_header].str.len() > 0]
  filtered_features = external_features
  if importance_threshold is not None:
@@ -2009,28 +2016,28 @@ class FeaturesEnricher(TransformerMixin):
  return
  else:
  self.logger.warning("search_keys not provided")
- raise ValidationError(bundle.get("empty_search_keys"))
+ raise ValidationError(self.bundle.get("empty_search_keys"))

  key_types = search_keys.values()

  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
- msg = bundle.get("date_and_datetime_simultanious")
+ msg = self.bundle.get("date_and_datetime_simultanious")
  self.logger.warning(msg)
  raise ValidationError(msg)

  if SearchKey.EMAIL in key_types and SearchKey.HEM in key_types:
- msg = bundle.get("email_and_hem_simultanious")
+ msg = self.bundle.get("email_and_hem_simultanious")
  self.logger.warning(msg)
  raise ValidationError(msg)

  if SearchKey.POSTAL_CODE in key_types and SearchKey.COUNTRY not in key_types and self.country_code is None:
- msg = bundle.get("postal_code_without_country")
+ msg = self.bundle.get("postal_code_without_country")
  self.logger.warning(msg)
  raise ValidationError(msg)

  for key_type in SearchKey.__members__.values():
  if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
- msg = bundle.get("multiple_search_key").format(key_type)
+ msg = self.bundle.get("multiple_search_key").format(key_type)
  self.logger.warning(msg)
  raise ValidationError(msg)

@@ -2040,7 +2047,7 @@ class FeaturesEnricher(TransformerMixin):
  # and not is_demo_dataset
  # and len(set(key_types).intersection(non_personal_keys)) == 0
  # ):
- # msg = bundle.get("unregistered_only_personal_keys")
+ # msg = self.bundle.get("unregistered_only_personal_keys")
  # self.logger.warning(msg + f" Provided search keys: {key_types}")
  # raise ValidationError(msg)

@@ -2081,7 +2088,7 @@ class FeaturesEnricher(TransformerMixin):
  )
  is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
  if is_demo_dataset:
- msg = bundle.get("demo_dataset_info")
+ msg = self.bundle.get("demo_dataset_info")
  self.logger.info(msg)
  if not self.__is_registered:
  print(msg)
@@ -2091,7 +2098,7 @@ class FeaturesEnricher(TransformerMixin):
  checked_generate_features = []
  for gen_feature in self.generate_features:
  if gen_feature not in x_columns:
- msg = bundle.get("missing_generate_feature").format(gen_feature, x_columns)
+ msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
  print(msg)
  self.logger.warning(msg)
  else:
@@ -2137,7 +2144,7 @@ class FeaturesEnricher(TransformerMixin):
  df = pd.concat([df, eval_df])

  if DEFAULT_INDEX in df.columns:
- msg = bundle.get("unsupported_index_column")
+ msg = self.bundle.get("unsupported_index_column")
  self.logger.info(msg)
  print(msg)
  self.fit_dropped_features.add(DEFAULT_INDEX)
@@ -2240,9 +2247,9 @@ class FeaturesEnricher(TransformerMixin):
  if search_id_callback is not None:
  search_id_callback(self._search_task.search_task_id)

- print(bundle.get("polling_search_task").format(self._search_task.search_task_id))
+ print(self.bundle.get("polling_search_task").format(self._search_task.search_task_id))
  if not self.__is_registered:
- print(bundle.get("polling_unregister_information"))
+ print(self.bundle.get("polling_unregister_information"))

  progress = self.get_progress(trace_id)
  prev_progress = None
@@ -2268,14 +2275,14 @@ class FeaturesEnricher(TransformerMixin):
  f"Search {self._search_task.search_task_id} failed with error {progress.error}"
  f" and message {progress.error_message}"
  )
- raise RuntimeError(bundle.get("search_task_failed_status"))
+ raise RuntimeError(self.bundle.get("search_task_failed_status"))
  time.sleep(poll_period_seconds)
  progress = self.get_progress(trace_id)
  except KeyboardInterrupt as e:
- print(bundle.get("search_stopping"))
+ print(self.bundle.get("search_stopping"))
  self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
  self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
- print(bundle.get("search_stopped"))
+ print(self.bundle.get("search_stopped"))
  raise e

  self._search_task.poll_result(trace_id, quiet=True)
@@ -2296,7 +2303,7 @@ class FeaturesEnricher(TransformerMixin):
  )
  zero_hit_columns = self.get_columns_by_search_keys(zero_hit_search_keys)
  if zero_hit_columns:
- msg = bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
+ msg = self.bundle.get("features_info_zero_hit_rate_search_keys").format(zero_hit_columns)
  self.logger.warning(msg)
  self.__display_support_link(msg)
  self.warning_counter.increment()
@@ -2308,7 +2315,7 @@ class FeaturesEnricher(TransformerMixin):
  unused_features_for_generation = [
  dataset.columns_renaming.get(col) or col for col in self._search_task.unused_features_for_generation
  ]
- msg = bundle.get("features_not_generated").format(unused_features_for_generation)
+ msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
  self.logger.warning(msg)
  print(msg)
  self.warning_counter.increment()
@@ -2323,7 +2330,7 @@ class FeaturesEnricher(TransformerMixin):

  if self._has_paid_features(exclude_features_sources):
  if calculate_metrics is not None and calculate_metrics:
- msg = bundle.get("metrics_with_paid_features")
+ msg = self.bundle.get("metrics_with_paid_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  else:
@@ -2334,7 +2341,7 @@ class FeaturesEnricher(TransformerMixin):
  if len(validated_X) < self.CALCULATE_METRICS_MIN_THRESHOLD or any(
  [len(eval_X) < self.CALCULATE_METRICS_MIN_THRESHOLD for eval_X, _ in validated_eval_set]
  ):
- msg = bundle.get("too_small_for_metrics")
+ msg = self.bundle.get("too_small_for_metrics")
  self.logger.warning(msg)
  calculate_metrics = False
  elif len(dataset) * len(dataset.columns) > self.CALCULATE_METRICS_THRESHOLD:
@@ -2365,7 +2372,7 @@ class FeaturesEnricher(TransformerMixin):
  self.__show_report_button()

  if not self.warning_counter.has_warnings():
- self.__display_support_link(bundle.get("all_ok_community_invite"))
+ self.__display_support_link(self.bundle.get("all_ok_community_invite"))

  def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
  # Check Multivariate time series
@@ -2376,14 +2383,14 @@ class FeaturesEnricher(TransformerMixin):
  and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
  and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
  ):
- msg = bundle.get("multivariate_timeseries_detected")
+ msg = self.bundle.get("multivariate_timeseries_detected")
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
  elif (
  self.cv is None
  and model_task_type != ModelTaskType.REGRESSION
  and self._get_group_columns(df, self.fit_search_keys)
  ):
- msg = bundle.get("group_k_fold_in_classification")
+ msg = self.bundle.get("group_k_fold_in_classification")
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)

  def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
@@ -2403,11 +2410,11 @@ class FeaturesEnricher(TransformerMixin):

  def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
  if _num_samples(X) == 0:
- raise ValidationError(bundle.get("x_is_empty"))
+ raise ValidationError(self.bundle.get("x_is_empty"))

  if isinstance(X, pd.DataFrame):
  if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
- raise ValidationError(bundle.get("x_multiindex_unsupported"))
+ raise ValidationError(self.bundle.get("x_multiindex_unsupported"))
  validated_X = X.copy()
  elif isinstance(X, pd.Series):
  validated_X = X.to_frame()
@@ -2416,12 +2423,12 @@ class FeaturesEnricher(TransformerMixin):
  renaming = {c: str(c) for c in validated_X.columns}
  validated_X = validated_X.rename(columns=renaming)
  else:
- raise ValidationError(bundle.get("unsupported_x_type").format(type(X)))
+ raise ValidationError(self.bundle.get("unsupported_x_type").format(type(X)))

  if len(set(validated_X.columns)) != len(validated_X.columns):
- raise ValidationError(bundle.get("x_contains_dup_columns"))
+ raise ValidationError(self.bundle.get("x_contains_dup_columns"))
  if not is_transform and not validated_X.index.is_unique:
- raise ValidationError(bundle.get("x_non_unique_index"))
+ raise ValidationError(self.bundle.get("x_non_unique_index"))

  if self.exclude_columns is not None:
  validated_X = validated_X.drop(columns=self.exclude_columns, errors="ignore")
@@ -2432,17 +2439,17 @@ class FeaturesEnricher(TransformerMixin):
  )

  if TARGET in validated_X.columns:
- raise ValidationError(bundle.get("x_contains_reserved_column_name").format(TARGET))
+ raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(TARGET))
  if not is_transform and EVAL_SET_INDEX in validated_X.columns:
- raise ValidationError(bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
+ raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
  if SYSTEM_RECORD_ID in validated_X.columns:
- raise ValidationError(bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
+ raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))

  return validated_X

  def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
  if _num_samples(y) == 0:
- raise ValidationError(bundle.get("y_is_empty"))
+ raise ValidationError(self.bundle.get("y_is_empty"))

  if (
  not isinstance(y, pd.Series)
@@ -2450,26 +2457,26 @@ class FeaturesEnricher(TransformerMixin):
  and not isinstance(y, np.ndarray)
  and not isinstance(y, list)
  ):
- raise ValidationError(bundle.get("unsupported_y_type").format(type(y)))
+ raise ValidationError(self.bundle.get("unsupported_y_type").format(type(y)))

  if _num_samples(X) != _num_samples(y):
- raise ValidationError(bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))
+ raise ValidationError(self.bundle.get("x_and_y_diff_size").format(_num_samples(X), _num_samples(y)))

  if isinstance(y, pd.DataFrame):
  if len(y.columns) != 1:
- raise ValidationError(bundle.get("y_invalid_dimension_dataframe"))
+ raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe"))
  if isinstance(y.columns, pd.MultiIndex) or isinstance(y.index, pd.MultiIndex):
- raise ValidationError(bundle.get("y_multiindex_unsupported"))
+ raise ValidationError(self.bundle.get("y_multiindex_unsupported"))
  y = y[y.columns[0]]

  if isinstance(y, pd.Series):
  if (y.index != X.index).any():
- raise ValidationError(bundle.get("x_and_y_diff_index"))
+ raise ValidationError(self.bundle.get("x_and_y_diff_index"))
  validated_y = y.copy()
  validated_y.rename(TARGET, inplace=True)
  elif isinstance(y, np.ndarray):
  if y.ndim != 1:
- raise ValidationError(bundle.get("y_invalid_dimension_array"))
+ raise ValidationError(self.bundle.get("y_invalid_dimension_array"))
  Xy = X.copy()
  Xy[TARGET] = y
  validated_y = Xy[TARGET].copy()
@@ -2479,24 +2486,24 @@ class FeaturesEnricher(TransformerMixin):
  validated_y = Xy[TARGET].copy()

  if validated_y.nunique() < 2:
- raise ValidationError(bundle.get("y_is_constant"))
+ raise ValidationError(self.bundle.get("y_is_constant"))

  return validated_y

  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
  if len(eval_pair) != 2:
- raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+ raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
  eval_X = eval_pair[0]
  eval_y = eval_pair[1]

  if _num_samples(eval_X) == 0:
- raise ValidationError(bundle.get("eval_x_is_empty"))
+ raise ValidationError(self.bundle.get("eval_x_is_empty"))
  if _num_samples(eval_y) == 0:
- raise ValidationError(bundle.get("eval_y_is_empty"))
+ raise ValidationError(self.bundle.get("eval_y_is_empty"))

  if isinstance(eval_X, pd.DataFrame):
  if isinstance(eval_X.columns, pd.MultiIndex) or isinstance(eval_X.index, pd.MultiIndex):
- raise ValidationError(bundle.get("eval_x_multiindex_unsupported"))
+ raise ValidationError(self.bundle.get("eval_x_multiindex_unsupported"))
  validated_eval_X = eval_X.copy()
  elif isinstance(eval_X, pd.Series):
  validated_eval_X = eval_X.to_frame()
@@ -2505,10 +2512,10 @@ class FeaturesEnricher(TransformerMixin):
  renaming = {c: str(c) for c in validated_eval_X.columns}
  validated_eval_X = validated_eval_X.rename(columns=renaming)
  else:
- raise ValidationError(bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))
+ raise ValidationError(self.bundle.get("unsupported_x_type_eval_set").format(type(eval_X)))

  if not validated_eval_X.index.is_unique:
- raise ValidationError(bundle.get("x_non_unique_index_eval_set"))
+ raise ValidationError(self.bundle.get("x_non_unique_index_eval_set"))

  if self.exclude_columns is not None:
  validated_eval_X = validated_eval_X.drop(columns=self.exclude_columns, errors="ignore")
@@ -2522,28 +2529,30 @@ class FeaturesEnricher(TransformerMixin):
  if set(validated_eval_X.columns.to_list()) == set(X.columns.to_list()):
  validated_eval_X = validated_eval_X[X.columns.to_list()]
  else:
- raise ValidationError(bundle.get("eval_x_and_x_diff_shape"))
+ raise ValidationError(self.bundle.get("eval_x_and_x_diff_shape"))

  if _num_samples(validated_eval_X) != _num_samples(eval_y):
  raise ValidationError(
- bundle.get("x_and_y_diff_size_eval_set").format(_num_samples(validated_eval_X), _num_samples(eval_y))
+ self.bundle.get("x_and_y_diff_size_eval_set").format(
+ _num_samples(validated_eval_X), _num_samples(eval_y)
+ )
  )

  if isinstance(eval_y, pd.DataFrame):
  if len(eval_y.columns) != 1:
- raise ValidationError(bundle.get("y_invalid_dimension_dataframe_eval_set"))
+ raise ValidationError(self.bundle.get("y_invalid_dimension_dataframe_eval_set"))
  if isinstance(eval_y.columns, pd.MultiIndex) or isinstance(eval_y.index, pd.MultiIndex):
- raise ValidationError(bundle.get("eval_y_multiindex_unsupported"))
+ raise ValidationError(self.bundle.get("eval_y_multiindex_unsupported"))
  eval_y = eval_y[eval_y.columns[0]]

  if isinstance(eval_y, pd.Series):
  if (eval_y.index != validated_eval_X.index).any():
- raise ValidationError(bundle.get("x_and_y_diff_index_eval_set"))
+ raise ValidationError(self.bundle.get("x_and_y_diff_index_eval_set"))
  validated_eval_y = eval_y.copy()
  validated_eval_y.rename(TARGET, inplace=True)
  elif isinstance(eval_y, np.ndarray):
  if eval_y.ndim != 1:
- raise ValidationError(bundle.get("y_invalid_dimension_array_eval_set"))
+ raise ValidationError(self.bundle.get("y_invalid_dimension_array_eval_set"))
  Xy = validated_eval_X.copy()
  Xy[TARGET] = eval_y
  validated_eval_y = Xy[TARGET].copy()
@@ -2552,27 +2561,29 @@ class FeaturesEnricher(TransformerMixin):
  Xy[TARGET] = eval_y
  validated_eval_y = Xy[TARGET].copy()
  else:
- raise ValidationError(bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
+ raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

  if validated_eval_y.nunique() < 2:
- raise ValidationError(bundle.get("y_is_constant_eval_set"))
+ raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

  return validated_eval_X, validated_eval_y

  def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
  if self.baseline_score_column is not None:
  if self.baseline_score_column not in X.columns:
- raise ValidationError(bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column))
+ raise ValidationError(
+ self.bundle.get("baseline_score_column_not_exists").format(self.baseline_score_column)
+ )
  if X[self.baseline_score_column].isna().any():
- raise ValidationError(bundle.get("baseline_score_column_has_na"))
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
  if eval_set is not None:
  if isinstance(eval_set, tuple):
  eval_set = [eval_set]
  for eval in eval_set:
  if self.baseline_score_column not in eval[0].columns:
- raise ValidationError(bundle.get("baseline_score_column_not_exists"))
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
  if eval[0][self.baseline_score_column].isna().any():
- raise ValidationError(bundle.get("baseline_score_column_has_na"))
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))

  @staticmethod
  def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -2856,7 +2867,7 @@ class FeaturesEnricher(TransformerMixin):
  ) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
  if result_features is None:
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
- raise RuntimeError(bundle.get("features_wasnt_returned"))
+ raise RuntimeError(self.bundle.get("features_wasnt_returned"))
  result_features = (
  result_features.drop(columns=EVAL_SET_INDEX)
  if EVAL_SET_INDEX in result_features.columns
@@ -2867,7 +2878,7 @@ class FeaturesEnricher(TransformerMixin):
  dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
  if len(dup_features) > 0:
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
- raise ValidationError(bundle.get("returned_features_same_as_passed").format(dup_features))
+ raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))

  # index overrites from result_features
  original_index_name = df_with_original_index.index.name
@@ -2927,10 +2938,10 @@ class FeaturesEnricher(TransformerMixin):

  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str]):
  if self._search_task is None:
- raise NotFittedError(bundle.get("transform_unfitted_enricher"))
+ raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
  features_meta = self._search_task.get_all_features_metadata_v2()
  if features_meta is None:
- raise Exception(bundle.get("missing_features_meta"))
+ raise Exception(self.bundle.get("missing_features_meta"))

  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3020,38 +3031,38 @@ class FeaturesEnricher(TransformerMixin):
  )
  features_info.append(
  {
- bundle.get("features_info_name"): feature_name,
- bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
- bundle.get("features_info_hitrate"): feature_meta.hit_rate,
- bundle.get("features_info_value_preview"): feature_sample,
- bundle.get("features_info_provider"): provider,
- bundle.get("features_info_source"): source,
- bundle.get("features_info_commercial_schema"): commercial_schema,
+ self.bundle.get("features_info_name"): feature_name,
+ self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
+ self.bundle.get("features_info_value_preview"): feature_sample,
+ self.bundle.get("features_info_provider"): provider,
+ self.bundle.get("features_info_source"): source,
+ self.bundle.get("features_info_commercial_schema"): commercial_schema,
  }
  )
  features_info_without_links.append(
  {
- bundle.get("features_info_name"): internal_feature_name,
- bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
- bundle.get("features_info_hitrate"): feature_meta.hit_rate,
- bundle.get("features_info_value_preview"): feature_sample,
- bundle.get("features_info_provider"): internal_provider,
- bundle.get("features_info_source"): internal_source,
- bundle.get("features_info_commercial_schema"): commercial_schema,
+ self.bundle.get("features_info_name"): internal_feature_name,
+ self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
+ self.bundle.get("features_info_value_preview"): feature_sample,
+ self.bundle.get("features_info_provider"): internal_provider,
+ self.bundle.get("features_info_source"): internal_source,
+ self.bundle.get("features_info_commercial_schema"): commercial_schema,
  }
  )
  internal_features_info.append(
  {
- bundle.get("features_info_name"): internal_feature_name,
+ self.bundle.get("features_info_name"): internal_feature_name,
  "feature_link": feature_meta.doc_link,
- bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
- bundle.get("features_info_hitrate"): feature_meta.hit_rate,
- bundle.get("features_info_value_preview"): feature_sample,
- bundle.get("features_info_provider"): internal_provider,
+ self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
+ self.bundle.get("features_info_value_preview"): feature_sample,
+ self.bundle.get("features_info_provider"): internal_provider,
  "provider_link": feature_meta.data_provider_link,
- bundle.get("features_info_source"): internal_source,
+ self.bundle.get("features_info_source"): internal_source,
  "source_link": feature_meta.data_source_link,
- bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
+ self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
  }
  )

@@ -3061,8 +3072,10 @@ class FeaturesEnricher(TransformerMixin):
  self._internal_features_info = pd.DataFrame(internal_features_info)
  do_without_pandas_limits(lambda: self.logger.info(f"Features info:\n{self._internal_features_info}"))

- self.relevant_data_sources = self._group_relevant_data_sources(self.features_info)
- self._relevant_data_sources_wo_links = self._group_relevant_data_sources(self._features_info_without_links)
+ self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
+ self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
+ self._features_info_without_links, self.bundle
+ )
  do_without_pandas_limits(
  lambda: self.logger.info(f"Relevant data sources:\n{self._relevant_data_sources_wo_links}")
  )
@@ -3122,7 +3135,7 @@ class FeaturesEnricher(TransformerMixin):
  return None

  @staticmethod
- def _group_relevant_data_sources(df: pd.DataFrame) -> pd.DataFrame:
+ def _group_relevant_data_sources(df: pd.DataFrame, bundle: ResourceBundle) -> pd.DataFrame:
  return (
  df.query(f"{bundle.get('features_info_provider')} != ''")
  .groupby([bundle.get("features_info_provider"), bundle.get("features_info_source")])
@@ -3177,31 +3190,31 @@ class FeaturesEnricher(TransformerMixin):
  }
  passed_unsupported_search_keys = unsupported_search_keys.intersection(search_keys.values())
  if len(passed_unsupported_search_keys) > 0:
- raise ValidationError(bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
+ raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))

  for column_id, meaning_type in search_keys.items():
  column_name = None
  if isinstance(column_id, str):
  if column_id not in x.columns:
- raise ValidationError(bundle.get("search_key_not_found").format(column_id, list(x.columns)))
+ raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
  column_name = column_id
  valid_search_keys[column_name] = meaning_type
  elif isinstance(column_id, int):
  if column_id >= x.shape[1]:
- raise ValidationError(bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
+ raise ValidationError(self.bundle.get("numeric_search_key_not_found").format(column_id, x.shape[1]))
  column_name = x.columns[column_id]
  valid_search_keys[column_name] = meaning_type
  else:
- raise ValidationError(bundle.get("unsupported_search_key_type").format(type(column_id)))
+ raise ValidationError(self.bundle.get("unsupported_search_key_type").format(type(column_id)))

  if meaning_type == SearchKey.COUNTRY and self.country_code is not None:
- msg = bundle.get("search_key_country_and_country_code")
+ msg = self.bundle.get("search_key_country_and_country_code")
  self.logger.warning(msg)
  print(msg)
  self.country_code = None

  if not self.__is_registered and not is_demo_dataset and meaning_type in SearchKey.personal_keys():
- msg = bundle.get("unregistered_with_personal_keys").format(meaning_type)
+ msg = self.bundle.get("unregistered_with_personal_keys").format(meaning_type)
  self.logger.warning(msg)
  if not silent_mode:
  self.warning_counter.increment()
@@ -3212,7 +3225,7 @@ class FeaturesEnricher(TransformerMixin):
  if x[column_name].isnull().all() or (
  is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
  ):
- raise ValidationError(bundle.get("empty_search_key").format(column_name))
+ raise ValidationError(self.bundle.get("empty_search_key").format(column_name))

  if self.detect_missing_search_keys and (
  not is_transform or set(valid_search_keys.values()) != set(self.fit_search_keys.values())
@@ -3222,7 +3235,7 @@ class FeaturesEnricher(TransformerMixin):
  )

  if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
- msg = bundle.get("unregistered_only_personal_keys")
+ msg = self.bundle.get("unregistered_only_personal_keys")
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
  raise ValidationError(msg)

@@ -3237,7 +3250,7 @@ class FeaturesEnricher(TransformerMixin):
  and next(iter(valid_search_keys.values())) == SearchKey.DATE
  and not silent_mode
  ):
- msg = bundle.get("date_only_search")
+ msg = self.bundle.get("date_only_search")
  print(msg)
  self.logger.warning(msg)
  self.warning_counter.increment()
@@ -3246,7 +3259,7 @@ class FeaturesEnricher(TransformerMixin):
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
  date_column = next(iter(maybe_date))
  if x[date_column].nunique() > 0.9 * _num_samples(x):
- msg = bundle.get("date_search_without_time_series")
+ msg = self.bundle.get("date_search_without_time_series")
  print(msg)
  self.logger.warning(msg)
  self.warning_counter.increment()
@@ -3255,7 +3268,7 @@ class FeaturesEnricher(TransformerMixin):
  for k, v in valid_search_keys.items():
  # Show warning for country only if country is the only key
  if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
- msg = bundle.get("single_constant_search_key").format(v, x[k].values[0])
+ msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
  print(msg)
  self.logger.warning(msg)
  self.warning_counter.increment()
@@ -3287,11 +3300,11 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback=progress_callback,
  )
  if self.metrics is not None:
- msg = bundle.get("quality_metrics_header")
+ msg = self.bundle.get("quality_metrics_header")
  display_html_dataframe(self.metrics, self.metrics, msg)

  def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
- msg = bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))
+ msg = self.bundle.get("features_info_header").format(len(self.feature_names_), list(search_keys.keys()))

  try:
  _ = get_ipython() # type: ignore
@@ -3300,16 +3313,16 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(msg)
  if len(self.feature_names_) > 0:
  display_html_dataframe(
- self.features_info, self._features_info_without_links, bundle.get("relevant_features_header")
+ self.features_info, self._features_info_without_links, self.bundle.get("relevant_features_header")
  )

  display_html_dataframe(
  self.relevant_data_sources,
  self._relevant_data_sources_wo_links,
- bundle.get("relevant_data_sources_header"),
+ self.bundle.get("relevant_data_sources_header"),
  )
  else:
- msg = bundle.get("features_info_zero_important_features")
+ msg = self.bundle.get("features_info_zero_important_features")
  self.logger.warning(msg)
  self.__display_support_link(msg)
  self.warning_counter.increment()
@@ -3336,14 +3349,14 @@ class FeaturesEnricher(TransformerMixin):
  return float(importance_threshold) if importance_threshold is not None else 0.0
  except ValueError:
  self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
- raise ValidationError(bundle.get("invalid_importance_threshold"))
+ raise ValidationError(self.bundle.get("invalid_importance_threshold"))

  def __validate_max_features(self, max_features: Optional[int]) -> int:
  try:
  return int(max_features) if max_features is not None else 400
  except ValueError:
  self.logger.exception(f"Invalid max_features provided: {max_features}")
- raise ValidationError(bundle.get("invalid_max_features"))
+ raise ValidationError(self.bundle.get("invalid_max_features"))

  def __filtered_enriched_features(
  self,
@@ -3375,7 +3388,7 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
  self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("postal_code_detected").format(maybe_key))
+ print(self.bundle.get("postal_code_detected").format(maybe_key))

  if (
  SearchKey.COUNTRY not in search_keys.values()
@@ -3388,7 +3401,7 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("country_detected").format(maybe_key))
+ print(self.bundle.get("country_detected").format(maybe_key))

  if (
  SearchKey.EMAIL not in search_keys.values()
@@ -3402,13 +3415,13 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
  self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("email_detected").format(maybe_key))
+ print(self.bundle.get("email_detected").format(maybe_key))
  else:
  self.logger.warning(
  f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
  )
  if not silent_mode:
- print(bundle.get("email_detected_not_registered").format(maybe_key))
+ print(self.bundle.get("email_detected_not_registered").format(maybe_key))
  self.warning_counter.increment()

  if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
@@ -3419,20 +3432,20 @@ class FeaturesEnricher(TransformerMixin):
  self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
  self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
  if not silent_mode:
- print(bundle.get("phone_detected").format(maybe_key))
+ print(self.bundle.get("phone_detected").format(maybe_key))
  else:
  self.logger.warning(
  f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
  )
  if not silent_mode:
- print(bundle.get("phone_detected_not_registered"))
+ print(self.bundle.get("phone_detected_not_registered"))
  self.warning_counter.increment()

  return search_keys

  def _validate_binary_observations(self, y, task_type: ModelTaskType):
  if task_type == ModelTaskType.BINARY and (y.value_counts() < 1000).any():
- msg = bundle.get("binary_small_dataset")
+ msg = self.bundle.get("binary_small_dataset")
  self.logger.warning(msg)
  print(msg)

@@ -3447,8 +3460,8 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.exception("Failed to dump python libs")

  def __display_support_link(self, link_text: Optional[str] = None):
- support_link = bundle.get("support_link")
- link_text = link_text or bundle.get("support_text")
+ support_link = self.bundle.get("support_link")
+ link_text = link_text or self.bundle.get("support_text")
  try:
  from IPython.display import HTML, display

@@ -3564,7 +3577,7 @@ def _num_samples(x):
  raise TypeError(message) from type_error


- def is_frames_equal(first, second) -> bool:
+ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
  if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
  isinstance(first, pd.Series) and isinstance(second, pd.Series)
  ):