upgini 1.2.122a4__py3-none-any.whl → 1.2.146a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of upgini might be problematic.

@@ -42,8 +42,10 @@ from upgini.http import (
  get_rest_client,
  )
  from upgini.mdc import MDC
+ from upgini.mdc.context import get_mdc_fields
  from upgini.metadata import (
  COUNTRY,
+ CURRENT_DATE_COL,
  DEFAULT_INDEX,
  ENTITY_SYSTEM_RECORD_ID,
  EVAL_SET_INDEX,
@@ -76,7 +78,8 @@ from upgini.utils.custom_loss_utils import (
  )
  from upgini.utils.cv_utils import CVConfig, get_groups
  from upgini.utils.datetime_utils import (
- DateTimeSearchKeyConverter,
+ DateSearchKeyDetector,
+ DateTimeConverter,
  is_blocked_time_series,
  is_dates_distribution_valid,
  is_time_series,
@@ -167,7 +170,6 @@ class FeaturesEnricher(TransformerMixin):
  """

  TARGET_NAME = "target"
- CURRENT_DATE = "current_date"
  RANDOM_STATE = 42
  CALCULATE_METRICS_THRESHOLD = 50_000_000
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -220,7 +222,9 @@ class FeaturesEnricher(TransformerMixin):
  cv: CVType | None = None,
  loss: str | None = None,
  autodetect_search_keys: bool = True,
+ # deprecated, use text_features instead
  generate_features: list[str] | None = None,
+ text_features: list[str] | None = None,
  columns_for_online_api: list[str] | None = None,
  round_embeddings: int | None = None,
  logs_enabled: bool = True,
@@ -236,6 +240,7 @@ class FeaturesEnricher(TransformerMixin):
  generate_search_key_features: bool = True,
  sample_config: SampleConfig | None = None,
  print_trace_id: bool = False,
+ print_loaded_report: bool = True,
  **kwargs,
  ):
  self.bundle = get_custom_bundle(custom_bundle_config)
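Note: the two constructor hunks above introduce text_features as the preferred name for the deprecated generate_features argument (a later hunk resolves them as "text_features or generate_features") and add a print_loaded_report flag that controls whether the selected-features report is printed when an enricher is restored from an existing search_id. A minimal usage sketch under those assumptions (the column names are hypothetical):

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(
        search_keys={"signup_date": SearchKey.DATE},  # hypothetical search-key column
        text_features=["review_text"],                # preferred over deprecated generate_features
        print_loaded_report=False,                    # only relevant when passing search_id=...
    )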
@@ -269,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
  self.X: pd.DataFrame | None = None
  self.y: pd.Series | None = None
  self.eval_set: list[tuple] | None = None
- self.autodetected_search_keys: dict[str, SearchKey] = {}
+ self.autodetected_search_keys: dict[str, SearchKey] | None = None
  self.imbalanced = False
  self.fit_select_features = True
  self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -282,11 +287,9 @@ class FeaturesEnricher(TransformerMixin):
  self.id_columns = id_columns
  self.id_columns_encoder = None
  self.country_code = country_code
- self.__validate_search_keys(search_keys, search_id)
+ self.__validate_search_keys(self.search_keys, search_id)

- if model_task_type is not None:
- self.model_task_type = ModelTaskType.parse(model_task_type)
- self.model_task_type = model_task_type
+ self.model_task_type = ModelTaskType.parse(model_task_type)
  self.endpoint = endpoint
  self._search_task: SearchTask | None = None
  self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
@@ -307,10 +310,8 @@ class FeaturesEnricher(TransformerMixin):
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

  print(self.bundle.get("search_by_task_id_start"))
- trace_id = str(uuid.uuid4())
- if self.print_trace_id:
- print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
- with MDC(trace_id=trace_id):
+ trace_id = self._get_trace_id()
+ with MDC(correlation_id=trace_id, search_task_id=search_id):
  try:
  self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
@@ -318,8 +319,9 @@ class FeaturesEnricher(TransformerMixin):
  x_columns = [c.name for c in file_metadata.columns]
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
  df = pd.DataFrame(columns=x_columns)
- self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
- self.__show_selected_features()
+ self.__prepare_feature_importances(df, silent=True, update_selected_features=False)
+ if print_loaded_report:
+ self.__show_selected_features()
  # TODO validate search_keys with search_keys from file_metadata
  print(self.bundle.get("search_by_task_id_finish"))
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -344,14 +346,14 @@ class FeaturesEnricher(TransformerMixin):
  self.shared_datasets = shared_datasets
  if shared_datasets is not None:
  self.runtime_parameters.properties["shared_datasets"] = ",".join(shared_datasets)
- self.generate_features = generate_features
+ self.generate_features = text_features or generate_features
  self.round_embeddings = round_embeddings
- if generate_features is not None:
- if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
+ if self.generate_features is not None:
+ if len(self.generate_features) > self.GENERATE_FEATURES_LIMIT:
  msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
  self.logger.error(msg)
  raise ValidationError(msg)
- self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
+ self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
  if round_embeddings is not None:
  if not isinstance(round_embeddings, int) or round_embeddings < 0:
  msg = self.bundle.get("invalid_round_embeddings")
@@ -486,9 +488,9 @@ class FeaturesEnricher(TransformerMixin):
  stability_agg_func: str, optional (default="max")
  Function to aggregate stability values. Can be "max", "min", "mean".
  """
- trace_id = str(uuid.uuid4())
+ trace_id = self._get_trace_id()
  if self.print_trace_id:
- print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+ print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
  start_time = time.time()
  auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
  search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
@@ -500,7 +502,7 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar.progress = search_progress.to_progress_bar()
  progress_bar.display()

- with MDC(trace_id=trace_id):
+ with MDC(correlation_id=trace_id):
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for fit: {args}"
  self.logger.warning(msg)
@@ -521,10 +523,9 @@ class FeaturesEnricher(TransformerMixin):
  self.X = X
  self.y = y
  self.eval_set = self._check_eval_set(eval_set, X)
- self.dump_input(trace_id, X, y, self.eval_set)
+ self.dump_input(X, y, self.eval_set)
  self.__set_select_features(select_features)
  self.__inner_fit(
- trace_id,
  X,
  y,
  self.eval_set,
@@ -645,11 +646,11 @@ class FeaturesEnricher(TransformerMixin):

  self.warning_counter.reset()
  auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
- trace_id = str(uuid.uuid4())
+ trace_id = self._get_trace_id()
  if self.print_trace_id:
- print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+ print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
  start_time = time.time()
- with MDC(trace_id=trace_id):
+ with MDC(correlation_id=trace_id):
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for fit_transform: {args}"
  self.logger.warning(msg)
@@ -676,13 +677,9 @@ class FeaturesEnricher(TransformerMixin):
  self.y = y
  self.eval_set = self._check_eval_set(eval_set, X)
  self.__set_select_features(select_features)
- self.dump_input(trace_id, X, y, self.eval_set)
-
- if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
- raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
+ self.dump_input(X, y, self.eval_set)

  self.__inner_fit(
- trace_id,
  X,
  y,
  self.eval_set,
@@ -734,9 +731,9 @@ class FeaturesEnricher(TransformerMixin):

  result = self.transform(
  X,
+ y,
  exclude_features_sources=exclude_features_sources,
  keep_input=keep_input,
- trace_id=trace_id,
  silent_mode=True,
  progress_bar=progress_bar,
  progress_callback=progress_callback,
@@ -747,12 +744,10 @@ class FeaturesEnricher(TransformerMixin):
  def transform(
  self,
  X: pd.DataFrame,
- *args,
  y: pd.Series | None = None,
+ *args,
  exclude_features_sources: list[str] | None = None,
  keep_input: bool = True,
- trace_id: str | None = None,
- metrics_calculation: bool = False,
  silent_mode=False,
  progress_bar: ProgressBar | None = None,
  progress_callback: Callable[[SearchProgress], Any] | None = None,
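Note: in the hunk above, transform() now takes y as a regular second parameter (it moves ahead of *args), and trace_id and metrics_calculation disappear from the public signature; the surrounding hunks also show the enrich path passing y positionally and a TARGET column, if present, being dropped from the returned frame. A call sketch under those assumptions, with hypothetical variable names:

    # X_new and y_new are the inference-time frame and (optional) target
    enriched = enricher.transform(X_new, y_new, keep_input=True)
    # callers that previously passed trace_id=... or metrics_calculation=... should drop those keyword arguments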
@@ -789,10 +784,12 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar.progress = search_progress.to_progress_bar()
  if new_progress:
  progress_bar.display()
- trace_id = trace_id or str(uuid.uuid4())
+ trace_id = self._get_trace_id()
+ if self.print_trace_id:
+ print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
  search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
- with MDC(trace_id=trace_id, search_id=search_id):
- self.dump_input(trace_id, X)
+ with MDC(correlation_id=trace_id, search_id=search_id):
+ self.dump_input(X)
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for transform: {args}"
  self.logger.warning(msg)
@@ -805,15 +802,15 @@ class FeaturesEnricher(TransformerMixin):
  start_time = time.time()
  try:
  result, _, _, _ = self.__inner_transform(
- trace_id,
  X,
  y=y,
  exclude_features_sources=exclude_features_sources,
- metrics_calculation=metrics_calculation,
  silent_mode=silent_mode,
  progress_bar=progress_bar,
  keep_input=keep_input,
  )
+ if result is not None and TARGET in result.columns:
+ result = result.drop(columns=TARGET)
  self.logger.info("Transform finished successfully")
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
  if progress_bar is not None:
@@ -868,7 +865,6 @@ class FeaturesEnricher(TransformerMixin):
  estimator=None,
  exclude_features_sources: list[str] | None = None,
  remove_outliers_calc_metrics: bool | None = None,
- trace_id: str | None = None,
  internal_call: bool = False,
  progress_bar: ProgressBar | None = None,
  progress_callback: Callable[[SearchProgress], Any] | None = None,
@@ -906,10 +902,10 @@ class FeaturesEnricher(TransformerMixin):
  Dataframe with metrics calculated on train and validation datasets.
  """

- trace_id = trace_id or str(uuid.uuid4())
+ trace_id = self._get_trace_id()
  start_time = time.time()
  search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
- with MDC(trace_id=trace_id, search_id=search_id):
+ with MDC(correlation_id=trace_id, search_id=search_id):
  self.logger.info("Start calculate metrics")
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
@@ -939,7 +935,7 @@ class FeaturesEnricher(TransformerMixin):
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- effective_X, effective_y, effective_eval_set, silent=internal_call
+ effective_X, effective_y, effective_eval_set
  )

  if self.X is None:
@@ -974,11 +970,13 @@ class FeaturesEnricher(TransformerMixin):
  self.__display_support_link(msg)
  return None

+ search_keys = self._get_fit_search_keys_with_original_names()
+
  cat_features_from_backend = self.__get_categorical_features()
  # Convert to original names
  cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
- estimator, validated_X, self.search_keys
+ estimator, validated_X, search_keys
  )
  # Exclude id columns from cat_features
  if self.id_columns and self.id_columns_encoder is not None:
@@ -1000,7 +998,6 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")

  prepared_data = self._get_cached_enriched_data(
- trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
@@ -1046,7 +1043,8 @@ class FeaturesEnricher(TransformerMixin):
  with Spinner():
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)

- has_date = self._get_date_column(search_keys) is not None
+ date_col = self._get_date_column(search_keys)
+ has_date = date_col is not None and date_col in validated_X.columns
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
  cat_features = list(set(client_cat_features + cat_features_from_backend))
  has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
@@ -1252,7 +1250,7 @@ class FeaturesEnricher(TransformerMixin):

  if updating_shaps is not None:
  decoded_X = self._decode_id_columns(fitting_X)
- self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
+ self._update_shap_values(decoded_X, updating_shaps, silent=not internal_call)

  metrics_df = pd.DataFrame(metrics)
  mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1302,9 +1300,40 @@ class FeaturesEnricher(TransformerMixin):
  finally:
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

+ def _get_trace_id(self):
+ if get_mdc_fields().get("correlation_id") is not None:
+ return get_mdc_fields().get("correlation_id")
+ return int(time.time() * 1000)
+
+ def _get_autodetected_search_keys(self):
+ if self.autodetected_search_keys is None and self._search_task is not None:
+ meta = self._search_task.get_file_metadata(self._get_trace_id())
+ autodetected_search_keys = meta.autodetectedSearchKeys or {}
+ self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
+
+ return self.autodetected_search_keys
+
+ def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
+ if self.autodetected_search_keys is None:
+ self.autodetected_search_keys = dict()
+ self.autodetected_search_keys.update(adding_search_keys)
+ return self.autodetected_search_keys
+
+ def _get_fit_search_keys_with_original_names(self):
+ if self.fit_search_keys is None and self._search_task is not None:
+ fit_search_keys = dict()
+ meta = self._search_task.get_file_metadata(self._get_trace_id())
+ for column in meta.columns:
+ # TODO check for EMAIL->HEM and multikeys
+ search_key_type = SearchKey.from_meaning_type(column.meaningType)
+ if search_key_type is not None:
+ fit_search_keys[column.originalName] = search_key_type
+ else:
+ fit_search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in self.fit_search_keys.items()}
+ return fit_search_keys
+
  def _select_features_by_psi(
  self,
- trace_id: str,
  X: pd.DataFrame | pd.Series | np.ndarray,
  y: pd.DataFrame | pd.Series | np.ndarray | list,
  eval_set: list[tuple] | tuple | None,
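Note: the new helper methods above change how tracing works. _get_trace_id reuses a correlation_id already bound in the MDC context when present and otherwise falls back to a millisecond timestamp, and the MDC calls and printed Datadog links throughout this diff are keyed by correlation_id rather than trace_id. A rough sketch of that fallback, assuming get_mdc_fields() returns the currently bound MDC fields as a dict (the helper name current_trace_id is illustrative):

    import time

    from upgini.mdc import MDC
    from upgini.mdc.context import get_mdc_fields

    def current_trace_id():
        # reuse an externally supplied correlation id if the caller bound one via MDC
        existing = get_mdc_fields().get("correlation_id")
        return existing if existing is not None else int(time.time() * 1000)

    with MDC(correlation_id=current_trace_id()):
        ...  # calls made inside this block share one correlation id in the logs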
@@ -1317,12 +1346,13 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback: Callable | None = None,
  ):
  search_keys = self.search_keys.copy()
- validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+ search_keys.update(self._get_autodetected_search_keys())
+ validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set)
  if isinstance(X, np.ndarray):
  search_keys = {str(k): v for k, v in search_keys.items()}

  date_column = self._get_date_column(search_keys)
- has_date = date_column is not None
+ has_date = date_column is not None and date_column in validated_X.columns
  if not has_date:
  self.logger.info("No date column for OOT PSI calculation")
  return
@@ -1352,7 +1382,6 @@ class FeaturesEnricher(TransformerMixin):
  ]

  prepared_data = self._get_cached_enriched_data(
- trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
@@ -1417,13 +1446,11 @@ class FeaturesEnricher(TransformerMixin):
  # Find latest eval set or earliest if all eval sets are before train set
  date_column = self._get_date_column(search_keys)

- date_converter = DateTimeSearchKeyConverter(
+ date_converter = DateTimeConverter(
  date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
  )

- X = date_converter.convert(X)
-
- x_date = X[date_column].dropna()
+ x_date = date_converter.to_date_ms(X).dropna()
  if len(x_date) == 0:
  self.logger.warning("Empty date column in X")
  return []
@@ -1436,8 +1463,7 @@ class FeaturesEnricher(TransformerMixin):
  if date_column not in eval_x.columns:
  self.logger.warning(f"Date column not found in eval_set {i + 1}")
  continue
- eval_x = date_converter.convert(eval_x)
- eval_x_date = eval_x[date_column].dropna()
+ eval_x_date = date_converter.to_date_ms(eval_x).dropna()
  if len(eval_x_date) < 1000:
  self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
  continue
@@ -1474,8 +1500,7 @@ class FeaturesEnricher(TransformerMixin):
  )
  checking_eval_set_df = checking_eval_set_df.copy()

- checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
- checking_eval_set_df = date_converter.convert(checking_eval_set_df)
+ checking_eval_set_df[date_column] = date_converter.to_date_ms(eval_set_dates[selected_eval_set_idx].to_frame())

  psi_values_sparse = calculate_sparsity_psi(
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -1505,7 +1530,7 @@ class FeaturesEnricher(TransformerMixin):

  return total_unstable_features

- def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
+ def _update_shap_values(self, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
  self.logger.info(f"Updating SHAP values: {new_shaps}")
  new_shaps = {
@@ -1513,7 +1538,7 @@ class FeaturesEnricher(TransformerMixin):
  for feature, shap in new_shaps.items()
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
  }
- self.__prepare_feature_importances(trace_id, df, new_shaps)
+ self.__prepare_feature_importances(df, new_shaps)

  if not silent and self.features_info_display_handle is not None:
  try:
@@ -1640,7 +1665,7 @@ class FeaturesEnricher(TransformerMixin):

  if not isinstance(_cv, BaseCrossValidator):
  date_column = self._get_date_column(search_keys)
- date_series = X[date_column] if date_column is not None else None
+ date_series = X[date_column] if date_column is not None and date_column in X.columns else None
  _cv, groups = CVConfig(
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
  ).get_cv_and_groups(X)
@@ -1693,7 +1718,6 @@ class FeaturesEnricher(TransformerMixin):

  def _get_cached_enriched_data(
  self,
- trace_id: str,
  X: pd.DataFrame | pd.Series | np.ndarray | None = None,
  y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
  eval_set: list[tuple] | tuple | None = None,
@@ -1709,10 +1733,9 @@ class FeaturesEnricher(TransformerMixin):
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  checked_eval_set = self._check_eval_set(eval_set, X)
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)

  sampled_data = self._get_enriched_datasets(
- trace_id=trace_id,
  validated_X=validated_X,
  validated_y=validated_y,
  eval_set=validated_eval_set,
@@ -1739,17 +1762,24 @@ class FeaturesEnricher(TransformerMixin):

  self.logger.info(f"Excluding search keys: {excluding_search_keys}")

+ file_meta = self._search_task.get_file_metadata(self._get_trace_id())
+ fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
+ original_dropped_features = [columns_renaming.get(f, f) for f in fit_dropped_features]
+
  client_features = [
  c
- for c in (validated_X.columns.to_list() + generated_features)
+ for c in validated_X.columns.to_list()
  if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
  and c
  not in (
  excluding_search_keys
- + list(self.fit_dropped_features)
- + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
+ + original_dropped_features
+ + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
  )
  ]
+ client_features.extend(f for f in generated_features if f in self.feature_names_)
+ if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
+ client_features.append(self.baseline_score_column)
  self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

  selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
@@ -1848,7 +1878,7 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
  enriched_eval_X, eval_y_sampled, self.cv
  )
- if date_column is not None:
+ if date_column is not None and date_column in eval_X_sorted.columns:
  eval_set_dates[idx] = eval_X_sorted[date_column]
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
@@ -1909,7 +1939,6 @@ class FeaturesEnricher(TransformerMixin):

  def _get_enriched_datasets(
  self,
- trace_id: str,
  validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
  validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
  eval_set: list[tuple] | None,
@@ -1937,7 +1966,7 @@ class FeaturesEnricher(TransformerMixin):
  and self.df_with_original_index is not None
  ):
  self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
- return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
+ return self.__get_enriched_from_fit(validated_X, validated_y, eval_set, remove_outliers_calc_metrics)
  else:
  self.logger.info(
  "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
@@ -1949,7 +1978,6 @@ class FeaturesEnricher(TransformerMixin):
  validated_y,
  eval_set,
  exclude_features_sources,
- trace_id,
  progress_bar,
  progress_callback,
  is_for_metrics=is_for_metrics,
@@ -1997,7 +2025,7 @@ class FeaturesEnricher(TransformerMixin):
  date_column = self._get_date_column(search_keys)
  generated_features = []
  if date_column is not None:
- converter = DateTimeSearchKeyConverter(
+ converter = DateTimeConverter(
  date_column,
  self.date_format,
  self.logger,
@@ -2006,6 +2034,7 @@ class FeaturesEnricher(TransformerMixin):
  )
  # Leave original date column values
  df_with_date_features = converter.convert(df, keep_time=True)
+ # TODO check if this is correct
  df_with_date_features[date_column] = df[date_column]
  df = df_with_date_features
  generated_features = converter.generated_features
@@ -2036,15 +2065,17 @@ class FeaturesEnricher(TransformerMixin):

  # Sample after sorting by system_record_id for idempotency
  df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
- df = self.__downsample_for_metrics(df)

- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+ if DateTimeConverter.DATETIME_COL in df.columns:
+ df = df.drop(columns=DateTimeConverter.DATETIME_COL)

  df = df.rename(columns=columns_renaming)
  generated_features = [columns_renaming.get(c, c) for c in generated_features]
  search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}

+ # It uses original columns names!
+ df = self.__downsample_for_metrics(df)
+
  train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
  y_sampled = train_df[TARGET].copy()
@@ -2072,22 +2103,24 @@ class FeaturesEnricher(TransformerMixin):

  def __get_enriched_from_fit(
  self,
+ validated_X: pd.DataFrame,
+ validated_y: pd.Series,
  eval_set: list[tuple] | None,
- trace_id: str,
  remove_outliers_calc_metrics: bool | None,
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
  search_keys = self.fit_search_keys.copy()

  rows_to_drop = None
- has_date = self._get_date_column(search_keys) is not None
+ date_column = self._get_date_column(search_keys)
+ has_date = date_column is not None and date_column in validated_X.columns
  self.model_task_type = self.model_task_type or define_task(
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
  )
  if remove_outliers_calc_metrics is None:
  remove_outliers_calc_metrics = True
  if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
- target_outliers_df = self._search_task.get_target_outliers(trace_id)
+ target_outliers_df = self._search_task.get_target_outliers(self._get_trace_id())
  if target_outliers_df is not None and len(target_outliers_df) > 0:
  outliers = pd.merge(
  self.df_with_original_index,
@@ -2104,7 +2137,7 @@ class FeaturesEnricher(TransformerMixin):

  # index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
  # can differs from it
- fit_features = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
+ fit_features = self._search_task.get_all_initial_raw_features(self._get_trace_id(), metrics_calculation=True)

  # Pre-process features if we need to drop outliers
  if rows_to_drop is not None:
@@ -2122,6 +2155,24 @@ class FeaturesEnricher(TransformerMixin):
  drop_system_record_id=False,
  )

+ enriched_Xy.rename(columns=self.fit_columns_renaming, inplace=True)
+ search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+ generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
+
+ validated_Xy = validated_X.copy()
+ validated_Xy[TARGET] = validated_y
+
+ selecting_columns = self._selecting_input_and_generated_columns(
+ validated_Xy, self.fit_generated_features, keep_input=True
+ )
+ selecting_columns.extend(
+ c
+ for c in enriched_Xy.columns
+ if (c in self.feature_names_ and c not in selecting_columns and c not in validated_X.columns)
+ or c in [EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SYSTEM_RECORD_ID]
+ )
+ enriched_Xy = enriched_Xy[selecting_columns]
+
  # Handle eval sets extraction based on EVAL_SET_INDEX
  if EVAL_SET_INDEX in enriched_Xy.columns:
  eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
@@ -2133,7 +2184,11 @@ class FeaturesEnricher(TransformerMixin):
  ].copy()
  enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()

- x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
+ x_columns = [
+ c
+ for c in [self.fit_columns_renaming.get(k, k) for k in self.df_with_original_index.columns]
+ if c not in [EVAL_SET_INDEX, TARGET] and c in selecting_columns
+ ]
  X_sampled = enriched_Xy[x_columns].copy()
  y_sampled = enriched_Xy[TARGET].copy()
  enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -2155,15 +2210,6 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

- # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
- X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
- eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
- search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
- generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
-
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
  return self.__cache_and_return_results(
  datasets_hash,
@@ -2182,7 +2228,6 @@ class FeaturesEnricher(TransformerMixin):
  validated_y: pd.Series,
  eval_set: list[tuple] | None,
  exclude_features_sources: list[str] | None,
- trace_id: str,
  progress_bar: ProgressBar | None,
  progress_callback: Callable[[SearchProgress], Any] | None,
  is_for_metrics: bool = False,
@@ -2208,7 +2253,6 @@ class FeaturesEnricher(TransformerMixin):

  # Transform
  enriched_df, columns_renaming, generated_features, search_keys = self.__inner_transform(
- trace_id,
  X=df.drop(columns=[TARGET]),
  y=df[TARGET],
  exclude_features_sources=exclude_features_sources,
@@ -2385,11 +2429,10 @@ class FeaturesEnricher(TransformerMixin):

  return self.features_info

- def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
+ def get_progress(self, search_task: SearchTask | None = None) -> SearchProgress:
  search_task = search_task or self._search_task
  if search_task is not None:
- trace_id = trace_id or uuid.uuid4()
- return search_task.get_progress(trace_id)
+ return search_task.get_progress(self._get_trace_id())

  def display_transactional_transform_api(self, only_online_sources=False):
  if self.api_key is None:
@@ -2416,7 +2459,7 @@ class FeaturesEnricher(TransformerMixin):
  return "12345678"
  return "test_value"

- file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
+ file_metadata = self._search_task.get_file_metadata(time.time_ns())

  def get_column_meta(column_name: str) -> FileColumnMetadata:
  for c in file_metadata.columns:
@@ -2490,7 +2533,6 @@ if response.status_code == 200:

  def __inner_transform(
  self,
- trace_id: str,
  X: pd.DataFrame,
  *,
  y: pd.Series | None = None,
@@ -2509,174 +2551,133 @@ if response.status_code == 200:
2509
2551
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
2510
2552
 
2511
2553
  start_time = time.time()
2512
- search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
2513
- with MDC(trace_id=trace_id, search_id=search_id):
2514
- self.logger.info("Start transform")
2554
+ self.logger.info("Start transform")
2515
2555
 
2516
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(
2517
- X, y, eval_set=None, is_transform=True, silent=True
2518
- )
2519
- df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2556
+ search_keys = self.search_keys.copy()
2520
2557
 
2521
- validated_Xy = df.copy()
2558
+ self.__validate_search_keys(search_keys, self.search_id)
2522
2559
 
2523
- self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
2560
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
2561
+ df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2524
2562
 
2525
- # If there are no important features, return original dataframe
2526
- if len(self.feature_names_) == 0:
2527
- msg = self.bundle.get("no_important_features_for_transform")
2528
- self.__log_warning(msg, show_support_link=True)
2529
- return None, {}, [], self.search_keys
2563
+ validated_Xy = df.copy()
2530
2564
 
2531
- self.__validate_search_keys(self.search_keys, self.search_id)
2565
+ self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
2532
2566
 
2533
- if self._has_paid_features(exclude_features_sources):
2534
- msg = self.bundle.get("transform_with_paid_features")
2535
- self.logger.warning(msg)
2536
- self.__display_support_link(msg)
2537
- return None, {}, [], self.search_keys
2567
+ # If there are no important features, return original dataframe
2568
+ if len(self.feature_names_) == 0:
2569
+ msg = self.bundle.get("no_important_features_for_transform")
2570
+ self.__log_warning(msg, show_support_link=True)
2571
+ return None, {}, [], search_keys
2538
2572
 
2539
- online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
2540
- if len(online_api_features) > 0:
2541
- self.logger.warning(
2542
- f"There are important features for transform, that generated by online API: {online_api_features}"
2543
- )
2544
- msg = self.bundle.get("online_api_features_transform").format(online_api_features)
2545
- self.logger.warning(msg)
2546
- print(msg)
2547
- self.display_transactional_transform_api(only_online_sources=True)
2548
-
2549
- if not metrics_calculation:
2550
- transform_usage = self.rest_client.get_current_transform_usage(trace_id)
2551
- self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
2552
- if transform_usage.has_limit:
2553
- if len(X) > transform_usage.rest_rows:
2554
- rest_rows = max(transform_usage.rest_rows, 0)
2555
- msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
2556
- self.logger.warning(msg)
2557
- print(msg)
2558
- show_request_quote_button()
2559
- return None, {}, [], {}
2560
- else:
2561
- msg = self.bundle.get("transform_usage_info").format(
2562
- transform_usage.limit, transform_usage.transformed_rows
2563
- )
2564
- self.logger.info(msg)
2565
- print(msg)
2573
+ if self._has_paid_features(exclude_features_sources):
2574
+ msg = self.bundle.get("transform_with_paid_features")
2575
+ self.logger.warning(msg)
2576
+ self.__display_support_link(msg)
2577
+ return None, {}, [], search_keys
2566
2578
 
2567
- is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
2579
+ online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
2580
+ if len(online_api_features) > 0:
2581
+ self.logger.warning(
2582
+ f"There are important features for transform, that generated by online API: {online_api_features}"
2583
+ )
2584
+ msg = self.bundle.get("online_api_features_transform").format(online_api_features)
2585
+ self.logger.warning(msg)
2586
+ print(msg)
2587
+ self.display_transactional_transform_api(only_online_sources=True)
2588
+
2589
+ if not metrics_calculation:
2590
+ transform_usage = self.rest_client.get_current_transform_usage(self._get_trace_id())
2591
+ self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
2592
+ if transform_usage.has_limit:
2593
+ if len(X) > transform_usage.rest_rows:
2594
+ rest_rows = max(transform_usage.rest_rows, 0)
2595
+ bundle_msg = (
2596
+ "transform_usage_warning_registered" if self.__is_registered else "transform_usage_warning_demo"
2597
+ )
2598
+ msg = self.bundle.get(bundle_msg).format(rest_rows, len(X))
2599
+ self.logger.warning(msg)
2600
+ print(msg)
2601
+ show_request_quote_button(is_registered=self.__is_registered)
2602
+ return None, {}, [], {}
2603
+ else:
2604
+ msg = self.bundle.get("transform_usage_info").format(
2605
+ transform_usage.limit, transform_usage.transformed_rows
2606
+ )
2607
+ self.logger.info(msg)
2608
+ print(msg)
2568
2609
 
2569
- columns_to_drop = [
2570
- c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
2571
- ]
2572
- if len(columns_to_drop) > 0:
2573
- msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2574
- self.logger.warning(msg)
2575
- print(msg)
2576
- df = df.drop(columns=columns_to_drop)
2610
+ is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
2577
2611
 
2578
- search_keys = self.search_keys.copy()
2579
- if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2580
- search_keys.update(
2581
- {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
2582
- )
2612
+ columns_to_drop = [
2613
+ c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
2614
+ ]
2615
+ if len(columns_to_drop) > 0:
2616
+ msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2617
+ self.logger.warning(msg)
2618
+ print(msg)
2619
+ df = df.drop(columns=columns_to_drop)
2583
2620
 
2584
- search_keys = self.__prepare_search_keys(
2585
- df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
2586
- )
2621
+ if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2622
+ search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in search_keys})
2587
2623
 
2588
- df = self.__handle_index_search_keys(df, search_keys)
2624
+ search_keys = self.__prepare_search_keys(
2625
+ df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
2626
+ )
2589
2627
 
2590
- if DEFAULT_INDEX in df.columns:
2591
- msg = self.bundle.get("unsupported_index_column")
2592
- self.logger.info(msg)
2593
- print(msg)
2594
- df.drop(columns=DEFAULT_INDEX, inplace=True)
2595
- validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)
2628
+ df = self.__handle_index_search_keys(df, search_keys)
2596
2629
 
2597
- df = self.__add_country_code(df, search_keys)
2630
+ if DEFAULT_INDEX in df.columns:
2631
+ msg = self.bundle.get("unsupported_index_column")
2632
+ self.logger.info(msg)
2633
+ print(msg)
2634
+ df.drop(columns=DEFAULT_INDEX, inplace=True)
2635
+ validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)
2598
2636
 
2599
- generated_features = []
2600
- date_column = self._get_date_column(search_keys)
2601
- if date_column is not None:
2602
- converter = DateTimeSearchKeyConverter(
2603
- date_column,
2604
- self.date_format,
2605
- self.logger,
2606
- bundle=self.bundle,
2607
- generate_cyclical_features=self.generate_search_key_features,
2608
- )
2609
- df = converter.convert(df, keep_time=True)
2610
- self.logger.info(f"Date column after convertion: {df[date_column]}")
2611
- generated_features.extend(converter.generated_features)
2612
- else:
2613
- self.logger.info("Input dataset hasn't date column")
2614
- if self.__should_add_date_column():
2615
- df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
2616
-
2617
- email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2618
- if email_columns and self.generate_search_key_features:
2619
- generator = EmailDomainGenerator(email_columns)
2620
- df = generator.generate(df)
2621
- generated_features.extend(generator.generated_features)
2622
-
2623
- normalizer = Normalizer(self.bundle, self.logger)
2624
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2625
- columns_renaming = normalizer.columns_renaming
2626
-
2627
- # If there are no external features, we don't call backend on transform
2628
- external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
2629
- if len(external_features) == 0:
2630
- self.logger.warning(
2631
- "No external features found, returning original dataframe"
2632
- f" with generated important features: {self.feature_names_}"
2633
- )
2634
- df = df.rename(columns=columns_renaming)
2635
- generated_features = [columns_renaming.get(c, c) for c in generated_features]
2636
- search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
2637
- selecting_columns = self._selecting_input_and_generated_columns(
2638
- validated_Xy, generated_features, keep_input, trace_id
2639
- )
2640
- self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
2641
- if add_fit_system_record_id:
2642
- df = self._add_fit_system_record_id(
2643
- df,
2644
- search_keys,
2645
- SYSTEM_RECORD_ID,
2646
- TARGET,
2647
- columns_renaming,
2648
- self.id_columns,
2649
- self.cv,
2650
- self.model_task_type,
2651
- self.logger,
2652
- self.bundle,
2653
- )
2654
- selecting_columns.append(SYSTEM_RECORD_ID)
2655
- return df[selecting_columns], columns_renaming, generated_features, search_keys
2656
-
2657
- # Don't pass all features in backend on transform
2658
- runtime_parameters = self._get_copy_of_runtime_parameters()
2659
- features_for_transform = self._search_task.get_features_for_transform() or []
2660
- if len(features_for_transform) > 0:
2661
- missing_features_for_transform = [
2662
- columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
2663
- ]
2664
- if TARGET in missing_features_for_transform:
2665
- raise ValidationError(self.bundle.get("missing_target_for_transform"))
2637
+ df = self.__add_country_code(df, search_keys)
2666
2638
 
2667
- if len(missing_features_for_transform) > 0:
2668
- raise ValidationError(
2669
- self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
2670
- )
2671
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2639
+ generated_features = []
2640
+ date_column = self._get_date_column(search_keys)
2641
+ if date_column is not None:
2642
+ converter = DateTimeConverter(
2643
+ date_column,
2644
+ self.date_format,
2645
+ self.logger,
2646
+ bundle=self.bundle,
2647
+ generate_cyclical_features=self.generate_search_key_features,
2648
+ )
2649
+ df = converter.convert(df, keep_time=True)
2650
+ self.logger.info(f"Date column after convertion: {df[date_column]}")
2651
+ generated_features.extend(converter.generated_features)
2652
+ else:
2653
+ self.logger.info("Input dataset hasn't date column")
2654
+ if self.__should_add_date_column():
2655
+ df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
2672
2656
 
2673
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2657
+ email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2658
+ if email_columns and self.generate_search_key_features:
2659
+ generator = EmailDomainGenerator(email_columns)
2660
+ df = generator.generate(df)
2661
+ generated_features.extend(generator.generated_features)
2674
2662
 
2675
- df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
2676
- df[columns_for_system_record_id], index=False
2677
- ).astype("float64")
2663
+ normalizer = Normalizer(self.bundle, self.logger)
2664
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2665
+ columns_renaming = normalizer.columns_renaming
2678
2666
 
2679
- features_not_to_pass = []
2667
+ # If there are no external features, we don't call backend on transform
2668
+ external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
2669
+ if len(external_features) == 0:
2670
+ self.logger.warning(
2671
+ "No external features found, returning original dataframe"
2672
+ f" with generated important features: {self.feature_names_}"
2673
+ )
2674
+ df = df.rename(columns=columns_renaming)
2675
+ generated_features = [columns_renaming.get(c, c) for c in generated_features]
2676
+ search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
2677
+ selecting_columns = self._selecting_input_and_generated_columns(
2678
+ validated_Xy, generated_features, keep_input, is_transform=True
2679
+ )
2680
+ self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
2680
2681
  if add_fit_system_record_id:
2681
2682
  df = self._add_fit_system_record_id(
2682
2683
  df,
@@ -2690,86 +2691,144 @@ if response.status_code == 200:
2690
2691
  self.logger,
2691
2692
  self.bundle,
2692
2693
  )
2693
- df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2694
- features_not_to_pass.append(SORT_ID)
2694
+ selecting_columns.append(SYSTEM_RECORD_ID)
2695
+ return df[selecting_columns], columns_renaming, generated_features, search_keys
2695
2696
 
2696
- system_columns_with_original_index = [ENTITY_SYSTEM_RECORD_ID] + generated_features
2697
- if add_fit_system_record_id:
2698
- system_columns_with_original_index.append(SORT_ID)
2697
+ # Don't pass all features in backend on transform
2698
+ runtime_parameters = self._get_copy_of_runtime_parameters()
2699
+ features_for_transform = self._search_task.get_features_for_transform()
2700
+ if features_for_transform:
2701
+ missing_features_for_transform = [
2702
+ columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
2703
+ ]
2704
+ if TARGET in missing_features_for_transform:
2705
+ raise ValidationError(self.bundle.get("missing_target_for_transform"))
2699
2706
 
2700
- df_before_explode = df[system_columns_with_original_index].copy()
2707
+ if len(missing_features_for_transform) > 0:
2708
+ raise ValidationError(
2709
+ self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
2710
+ )
2711
+ features_for_embeddings = self._search_task.get_features_for_embeddings()
2712
+ if features_for_embeddings:
2713
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_embeddings)
2714
+ features_for_transform = [f for f in features_for_transform if f not in search_keys.keys()]
2701
2715
 
2702
- # Explode multiple search keys
2703
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
2716
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2704
2717
 
2705
- # Convert search keys and generate features on them
2718
+ df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2719
+ "float64"
2720
+ )
2706
2721
 
2707
- email_column = self._get_email_column(search_keys)
2708
- hem_column = self._get_hem_column(search_keys)
2709
- if email_column:
2710
- converter = EmailSearchKeyConverter(
2711
- email_column,
2712
- hem_column,
2713
- search_keys,
2714
- columns_renaming,
2715
- list(unnest_search_keys.keys()),
2716
- self.logger,
2717
- )
2718
- df = converter.convert(df)
2722
+ features_not_to_pass = []
2723
+ if add_fit_system_record_id:
2724
+ df = self._add_fit_system_record_id(
2725
+ df,
2726
+ search_keys,
2727
+ SYSTEM_RECORD_ID,
2728
+ TARGET,
2729
+ columns_renaming,
2730
+ self.id_columns,
2731
+ self.cv,
2732
+ self.model_task_type,
2733
+ self.logger,
2734
+ self.bundle,
2735
+ )
2736
+ df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2737
+ features_not_to_pass.append(SORT_ID)
2719
2738
 
2720
- ip_column = self._get_ip_column(search_keys)
2721
- if ip_column:
2722
- converter = IpSearchKeyConverter(
2723
- ip_column,
2724
- search_keys,
2725
- columns_renaming,
2726
- list(unnest_search_keys.keys()),
2727
- self.bundle,
2728
- self.logger,
2729
- )
2730
- df = converter.convert(df)
2739
+ system_columns_with_original_index = [ENTITY_SYSTEM_RECORD_ID] + generated_features
2740
+ if add_fit_system_record_id:
2741
+ system_columns_with_original_index.append(SORT_ID)
2731
2742
 
2732
- meaning_types = {}
2733
- meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
2734
- meaning_types.update({col: key.value for col, key in search_keys.items()})
2743
+ df_before_explode = df[system_columns_with_original_index].copy()
2735
2744
 
2736
- features_not_to_pass.extend(
2737
- [
2738
- c
2739
- for c in df.columns
2740
- if c not in search_keys.keys()
2741
- and c not in features_for_transform
2742
- and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2743
- ]
2744
- )
2745
+ # Explode multiple search keys
2746
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
2745
2747
 
2746
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2747
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2748
+ # Convert search keys and generate features on them
2748
2749
 
2749
- # search keys might be changed after explode
2750
- columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2751
- df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2752
- "float64"
2750
+ email_column = self._get_email_column(search_keys)
2751
+ hem_column = self._get_hem_column(search_keys)
2752
+ if email_column:
2753
+ converter = EmailSearchKeyConverter(
2754
+ email_column,
2755
+ hem_column,
2756
+ search_keys,
2757
+ columns_renaming,
2758
+ list(unnest_search_keys.keys()),
2759
+ self.logger,
2753
2760
  )
2754
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2755
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2756
- if SEARCH_KEY_UNNEST in df.columns:
2757
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2761
+ df = converter.convert(df)
2758
2762
 
2759
- df = df.reset_index(drop=True)
2763
+ ip_column = self._get_ip_column(search_keys)
2764
+ if ip_column:
2765
+ converter = IpSearchKeyConverter(
2766
+ ip_column,
2767
+ search_keys,
2768
+ columns_renaming,
2769
+ list(unnest_search_keys.keys()),
2770
+ self.bundle,
2771
+ self.logger,
2772
+ )
2773
+ df = converter.convert(df)
2760
2774
 
2761
- combined_search_keys = combine_search_keys(search_keys.keys())
2775
+ date_features = []
2776
+ for col in features_for_transform:
2777
+ if DateTimeConverter(col).is_datetime(df):
2778
+ df[col] = DateTimeConverter(col).to_date_string(df)
2779
+ date_features.append(col)
2780
+
2781
+ meaning_types = {}
2782
+ meaning_types.update(
2783
+ {
2784
+ col: FileColumnMeaningType.FEATURE
2785
+ for col in features_for_transform
2786
+ if col not in date_features and col not in generated_features
2787
+ }
2788
+ )
2789
+ meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in generated_features})
2790
+ meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
2791
+ meaning_types.update({col: key.value for col, key in search_keys.items()})
2762
2792
 
2763
- df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2793
+ features_not_to_pass.extend(
2794
+ [
2795
+ c
2796
+ for c in df.columns
2797
+ if c not in search_keys.keys()
2798
+ and c not in features_for_transform
2799
+ and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2800
+ ]
2801
+ )
2764
2802
 
2765
- df_without_features, full_duplicates_warning = clean_full_duplicates(
2766
- df_without_features, self.logger, bundle=self.bundle
2767
- )
2768
- if not silent_mode and full_duplicates_warning:
2769
- self.__log_warning(full_duplicates_warning)
2803
+ if DateTimeConverter.DATETIME_COL in df.columns:
2804
+ df = df.drop(columns=DateTimeConverter.DATETIME_COL)
2770
2805
 
2771
- del df
2772
- gc.collect()
2806
+ # search keys might be changed after explode
2807
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2808
+ df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2809
+ "float64"
2810
+ )
2811
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2812
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2813
+ if SEARCH_KEY_UNNEST in df.columns:
2814
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2815
+
2816
+ df = df.reset_index(drop=True)
2817
+
2818
+ combined_search_keys = combine_search_keys(search_keys.keys())
2819
+
2820
+ df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2821
+
2822
+ df_without_features, full_duplicates_warning = clean_full_duplicates(
2823
+ df_without_features, is_transform=True, logger=self.logger, bundle=self.bundle
2824
+ )
2825
+ if not silent_mode and full_duplicates_warning:
2826
+ self.__log_warning(full_duplicates_warning)
2827
+
2828
+ del df
2829
+ gc.collect()
2830
+
2831
+ def invoke_validation(df: pd.DataFrame):
2773
2832
 
2774
2833
  dataset = Dataset(
2775
2834
  "sample_" + str(uuid.uuid4()),
@@ -2789,7 +2848,7 @@ if response.status_code == 200:
2789
2848
  dataset.columns_renaming = columns_renaming
2790
2849
 
2791
2850
  validation_task = self._search_task.validation(
2792
- trace_id,
2851
+ self._get_trace_id(),
2793
2852
  dataset,
2794
2853
  start_time=start_time,
2795
2854
  extract_features=True,
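
The transform path in the hunks above now derives SYSTEM_RECORD_ID by hashing the sorted search-key and feature columns row-wise with pd.util.hash_pandas_object. A minimal sketch of that hashing pattern, using toy column names rather than upgini's internals:

    import pandas as pd

    df = pd.DataFrame({"phone": ["123", "456"], "country": ["US", "DE"], "f1": [0.1, 0.2]})
    # Sort the columns so the hash does not depend on column order, and use
    # index=False so identical rows hash identically regardless of the index.
    columns_for_record_id = sorted(["phone", "country", "f1"])
    df["system_record_id"] = pd.util.hash_pandas_object(
        df[columns_for_record_id], index=False
    ).astype("float64")

Casting the unsigned 64-bit hash to float64 mirrors the hunk above; note that the cast can drop a few low-order bits of the hash.
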
@@ -2801,7 +2860,7 @@ if response.status_code == 200:
2801
2860
  progress_callback=progress_callback,
2802
2861
  )
2803
2862
 
2804
- del df_without_features, dataset
2863
+ del df, dataset
2805
2864
  gc.collect()
2806
2865
 
2807
2866
  if not silent_mode:
@@ -2809,7 +2868,7 @@ if response.status_code == 200:
2809
2868
  if not self.__is_registered:
2810
2869
  print(self.bundle.get("polling_unregister_information"))
2811
2870
 
2812
- progress = self.get_progress(trace_id, validation_task)
2871
+ progress = self.get_progress(validation_task)
2813
2872
  progress.recalculate_eta(time.time() - start_time)
2814
2873
  if progress_bar is not None:
2815
2874
  progress_bar.progress = progress.to_progress_bar()
@@ -2831,15 +2890,15 @@ if response.status_code == 200:
2831
2890
  if progress.stage == ProgressStage.FAILED.value:
2832
2891
  raise Exception(progress.error_message)
2833
2892
  time.sleep(polling_period_seconds)
2834
- progress = self.get_progress(trace_id, validation_task)
2893
+ progress = self.get_progress(validation_task)
2835
2894
  except KeyboardInterrupt as e:
2836
2895
  print(self.bundle.get("search_stopping"))
2837
- self.rest_client.stop_search_task_v2(trace_id, validation_task.search_task_id)
2896
+ self.rest_client.stop_search_task_v2(self._get_trace_id(), validation_task.search_task_id)
2838
2897
  self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
2839
2898
  print(self.bundle.get("search_stopped"))
2840
2899
  raise e
2841
2900
 
2842
- validation_task.poll_result(trace_id, quiet=True)
2901
+ validation_task.poll_result(self._get_trace_id(), quiet=True)
2843
2902
 
2844
2903
  seconds_left = time.time() - start_time
2845
2904
  progress = SearchProgress(97.0, ProgressStage.DOWNLOADING, seconds_left)
@@ -2851,96 +2910,118 @@ if response.status_code == 200:
2851
2910
  if not silent_mode:
2852
2911
  print(self.bundle.get("transform_start"))
2853
2912
 
2854
- # Prepare input DataFrame for __enrich by concatenating generated ids and client features
2855
- df_before_explode = df_before_explode.rename(columns=columns_renaming)
2856
- generated_features = [columns_renaming.get(c, c) for c in generated_features]
2857
- combined_df = pd.concat(
2858
- [
2859
- validated_Xy.reset_index(drop=True),
2860
- df_before_explode.reset_index(drop=True),
2861
- ],
2862
- axis=1,
2863
- ).set_index(validated_Xy.index)
2864
-
2865
- result_features = validation_task.get_all_validation_raw_features(trace_id, metrics_calculation)
2866
-
2867
- result = self.__enrich(
2868
- combined_df,
2869
- result_features,
2870
- how="left",
2871
- )
2913
+ return validation_task.get_all_validation_raw_features(self._get_trace_id(), metrics_calculation)
2872
2914
 
2873
- selecting_columns = self._selecting_input_and_generated_columns(
2874
- validated_Xy, generated_features, keep_input, trace_id
2875
- )
2876
- selecting_columns.extend(
2877
- c
2878
- for c in result.columns
2879
- if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
2915
+ if len(df_without_features) <= Dataset.MAX_ROWS:
2916
+ result_features = invoke_validation(df_without_features)
2917
+ else:
2918
+ self.logger.warning(
2919
+ f"Dataset has more than {Dataset.MAX_ROWS} rows, splitting into chunks of {Dataset.MAX_ROWS} rows"
2880
2920
  )
2881
- if add_fit_system_record_id:
2882
- selecting_columns.append(SORT_ID)
2921
+ result_features_list = []
2883
2922
 
2884
- selecting_columns = list(set(selecting_columns))
2885
- # sorting: first columns from X, then generated features, then enriched features
2886
- sorted_selecting_columns = [c for c in validated_Xy.columns if c in selecting_columns]
2887
- for c in generated_features:
2888
- if c in selecting_columns and c not in sorted_selecting_columns:
2889
- sorted_selecting_columns.append(c)
2890
- for c in result.columns:
2891
- if c in selecting_columns and c not in sorted_selecting_columns:
2892
- sorted_selecting_columns.append(c)
2923
+ for i in range(0, len(df_without_features), Dataset.MAX_ROWS):
2924
+ chunk = df_without_features.iloc[i:i+Dataset.MAX_ROWS]
2925
+ result_features_list.append(invoke_validation(chunk))
2926
+ result_features = pd.concat(result_features_list)
2893
2927
 
2894
- self.logger.info(f"Transform sorted_selecting_columns: {sorted_selecting_columns}")
2928
+ # Prepare input DataFrame for __enrich by concatenating generated ids and client features
2929
+ df_before_explode = df_before_explode.rename(columns=columns_renaming)
2930
+ generated_features = [columns_renaming.get(c, c) for c in generated_features]
2931
+ combined_df = pd.concat(
2932
+ [
2933
+ validated_Xy.reset_index(drop=True),
2934
+ df_before_explode.reset_index(drop=True),
2935
+ ],
2936
+ axis=1,
2937
+ ).set_index(validated_Xy.index)
2938
+
2939
+ result = self.__enrich(
2940
+ combined_df,
2941
+ result_features,
2942
+ how="left",
2943
+ )
2895
2944
 
2896
- result = result[sorted_selecting_columns]
2945
+ selecting_columns = self._selecting_input_and_generated_columns(
2946
+ validated_Xy, generated_features, keep_input, is_transform=True
2947
+ )
2948
+ selecting_columns.extend(
2949
+ c
2950
+ for c in result.columns
2951
+ if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
2952
+ )
2953
+ if add_fit_system_record_id:
2954
+ selecting_columns.append(SORT_ID)
2897
2955
 
2898
- if self.country_added:
2899
- result = result.drop(columns=COUNTRY, errors="ignore")
2956
+ selecting_columns = list(set(selecting_columns))
2957
+ # sorting: first columns from X, then generated features, then enriched features
2958
+ sorted_selecting_columns = [c for c in validated_Xy.columns if c in selecting_columns]
2959
+ for c in generated_features:
2960
+ if c in selecting_columns and c not in sorted_selecting_columns:
2961
+ sorted_selecting_columns.append(c)
2962
+ for c in result.columns:
2963
+ if c in selecting_columns and c not in sorted_selecting_columns:
2964
+ sorted_selecting_columns.append(c)
2900
2965
 
2901
- if add_fit_system_record_id:
2902
- result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2966
+ self.logger.info(f"Transform sorted_selecting_columns: {sorted_selecting_columns}")
2903
2967
 
2904
- return result, columns_renaming, generated_features, search_keys
2968
+ result = result[sorted_selecting_columns]
2969
+
2970
+ if self.country_added:
2971
+ result = result.drop(columns=COUNTRY, errors="ignore")
2972
+
2973
+ if add_fit_system_record_id:
2974
+ result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
2975
+
2976
+ return result, columns_renaming, generated_features, search_keys
2905
2977
 
2906
2978
  def _selecting_input_and_generated_columns(
2907
2979
  self,
2908
2980
  validated_Xy: pd.DataFrame,
2909
2981
  generated_features: list[str],
2910
2982
  keep_input: bool,
2911
- trace_id: str,
2983
+ is_transform: bool = False,
2912
2984
  ):
2913
- fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
2914
- new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
2915
-
2916
- selected_generated_features = [
2917
- c for c in generated_features if not self.fit_select_features or c in self.feature_names_
2985
+ file_meta = self._search_task.get_file_metadata(self._get_trace_id())
2986
+ fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
2987
+ fit_input_columns = [c.originalName for c in file_meta.columns]
2988
+ original_dropped_features = [self.fit_columns_renaming.get(c, c) for c in fit_dropped_features]
2989
+ new_columns_on_transform = [
2990
+ c for c in validated_Xy.columns if c not in fit_input_columns and c not in original_dropped_features
2918
2991
  ]
2992
+ fit_original_search_keys = self._get_fit_search_keys_with_original_names()
2993
+
2994
+ selected_generated_features = [c for c in generated_features if c in self.feature_names_]
2919
2995
  if keep_input is True:
2920
2996
  selected_input_columns = [
2921
2997
  c
2922
2998
  for c in validated_Xy.columns
2923
2999
  if not self.fit_select_features
2924
3000
  or c in self.feature_names_
2925
- or c in new_columns_on_transform
2926
- or c in self.search_keys
3001
+ or (c in new_columns_on_transform and is_transform)
3002
+ or c in fit_original_search_keys
2927
3003
  or c in (self.id_columns or [])
2928
3004
  or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
3005
+ or c == self.baseline_score_column
2929
3006
  ]
2930
3007
  else:
2931
3008
  selected_input_columns = []
2932
3009
 
3010
+ if DEFAULT_INDEX in selected_input_columns:
3011
+ selected_input_columns.remove(DEFAULT_INDEX)
3012
+
2933
3013
  return selected_input_columns + selected_generated_features
2934
3014
 
2935
- def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
3015
+ def _validate_empty_search_keys(self, search_keys: dict[str, SearchKey], is_transform: bool = False):
2936
3016
  if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
2937
- if search_id:
2938
- self.logger.debug(f"search_id {search_id} provided without search_keys")
2939
- return
3017
+ if is_transform:
3018
+ self.logger.debug("Transform started without search_keys")
3019
+ # return
2940
3020
  else:
2941
3021
  self.logger.warning("search_keys not provided")
2942
- raise ValidationError(self.bundle.get("empty_search_keys"))
3022
+ # raise ValidationError(self.bundle.get("empty_search_keys"))
2943
3023
 
3024
+ def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
2944
3025
  key_types = search_keys.values()
2945
3026
 
2946
3027
  # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
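
The transform path above now splits inputs larger than Dataset.MAX_ROWS into fixed-size chunks, runs the validation search per chunk, and concatenates the returned features. A generic sketch of that chunking pattern (the 500_000 limit and fetch_features are placeholders, not upgini's actual constant or API):

    import pandas as pd

    MAX_ROWS = 500_000  # placeholder; the real limit comes from Dataset.MAX_ROWS

    def fetch_features(chunk: pd.DataFrame) -> pd.DataFrame:
        # Stand-in for the per-chunk validation call that returns enriched features.
        return chunk.assign(enriched=1)

    def fetch_all(df: pd.DataFrame) -> pd.DataFrame:
        if len(df) <= MAX_ROWS:
            return fetch_features(df)
        parts = [
            fetch_features(df.iloc[i : i + MAX_ROWS])
            for i in range(0, len(df), MAX_ROWS)
        ]
        return pd.concat(parts)
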
@@ -3004,7 +3085,6 @@ if response.status_code == 200:
3004
3085
 
3005
3086
  def __inner_fit(
3006
3087
  self,
3007
- trace_id: str,
3008
3088
  X: pd.DataFrame | pd.Series | np.ndarray,
3009
3089
  y: pd.DataFrame | pd.Series | np.ndarray | list | None,
3010
3090
  eval_set: list[tuple] | None,
@@ -3086,8 +3166,10 @@ if response.status_code == 200:
3086
3166
  df = self.__handle_index_search_keys(df, self.fit_search_keys)
3087
3167
  self.fit_search_keys = self.__prepare_search_keys(df, self.fit_search_keys, is_demo_dataset)
3088
3168
 
3169
+ df = self._validate_OOT(df, self.fit_search_keys)
3170
+
3089
3171
  maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3090
- has_date = maybe_date_column is not None
3172
+ has_date = maybe_date_column is not None and maybe_date_column in validated_X.columns
3091
3173
 
3092
3174
  self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
3093
3175
 
@@ -3114,7 +3196,7 @@ if response.status_code == 200:
3114
3196
 
3115
3197
  if DEFAULT_INDEX in df.columns:
3116
3198
  msg = self.bundle.get("unsupported_index_column")
3117
- self.logger.info(msg)
3199
+ self.logger.warning(msg)
3118
3200
  print(msg)
3119
3201
  self.fit_dropped_features.add(DEFAULT_INDEX)
3120
3202
  df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -3124,7 +3206,7 @@ if response.status_code == 200:
3124
3206
  self.fit_generated_features = []
3125
3207
 
3126
3208
  if has_date:
3127
- converter = DateTimeSearchKeyConverter(
3209
+ converter = DateTimeConverter(
3128
3210
  maybe_date_column,
3129
3211
  self.date_format,
3130
3212
  self.logger,
@@ -3170,15 +3252,19 @@ if response.status_code == 200:
3170
3252
  df, self.fit_search_keys, self.fit_generated_features
3171
3253
  )
3172
3254
  self.fit_columns_renaming = normalizer.columns_renaming
3173
- if normalizer.removed_features:
3174
- self.__log_warning(self.bundle.get("dataset_date_features").format(normalizer.removed_features))
3255
+ if normalizer.removed_datetime_features:
3256
+ self.fit_dropped_features.update(normalizer.removed_datetime_features)
3257
+ original_removed_datetime_features = [
3258
+ self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
3259
+ ]
3260
+ self.__log_warning(self.bundle.get("dataset_date_features").format(original_removed_datetime_features))
3175
3261
 
3176
3262
  non_feature_columns = [
3177
3263
  self.TARGET_NAME,
3178
3264
  EVAL_SET_INDEX,
3179
3265
  ] + list(self.fit_search_keys.keys())
3180
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3181
- non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
3266
+ if DateTimeConverter.DATETIME_COL in df.columns:
3267
+ non_feature_columns.append(DateTimeConverter.DATETIME_COL)
3182
3268
 
3183
3269
  features_columns = [c for c in df.columns if c not in non_feature_columns]
3184
3270
 
@@ -3220,7 +3306,7 @@ if response.status_code == 200:
3220
3306
  if fintech_warnings:
3221
3307
  for fintech_warning in fintech_warnings:
3222
3308
  self.__log_warning(fintech_warning)
3223
- df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
3309
+ df, full_duplicates_warning = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
3224
3310
  if full_duplicates_warning:
3225
3311
  if len(df) == 0:
3226
3312
  raise ValidationError(full_duplicates_warning)
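
clean_full_duplicates is now called with keyword arguments (and, on the transform path, with is_transform=True). Its implementation is not part of this diff; a rough approximation of a full-duplicate cleanup that also returns a warning message could look like:

    import logging
    import pandas as pd

    def clean_full_duplicates(df: pd.DataFrame, logger: logging.Logger | None = None):
        # Drop rows that are identical across all columns and report how many were removed.
        before = len(df)
        deduped = df.drop_duplicates(keep="first")
        removed = before - len(deduped)
        warning = f"{removed} fully duplicated rows were removed" if removed else None
        if warning and logger is not None:
            logger.warning(warning)
        return deduped, warning
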
@@ -3265,15 +3351,28 @@ if response.status_code == 200:
3265
3351
  ENTITY_SYSTEM_RECORD_ID,
3266
3352
  SEARCH_KEY_UNNEST,
3267
3353
  ] + list(self.fit_search_keys.keys())
3268
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3269
- non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
3354
+ if DateTimeConverter.DATETIME_COL in df.columns:
3355
+ non_feature_columns.append(DateTimeConverter.DATETIME_COL)
3270
3356
 
3271
3357
  features_columns = [c for c in df.columns if c not in non_feature_columns]
3272
3358
 
3359
+ # find date features
3360
+ date_features = []
3361
+ for col in features_columns:
3362
+ if DateTimeConverter(col).is_datetime(df):
3363
+ df[col] = DateTimeConverter(col).to_date_string(df)
3364
+ date_features.append(col)
3365
+
3273
3366
  meaning_types = {
3274
3367
  **{col: key.value for col, key in self.fit_search_keys.items()},
3275
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
3368
+ **{
3369
+ str(c): FileColumnMeaningType.FEATURE
3370
+ for c in df.columns
3371
+ if c not in non_feature_columns and c not in date_features and c not in self.fit_generated_features
3372
+ },
3276
3373
  }
3374
+ meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in self.fit_generated_features})
3375
+ meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
3277
3376
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
3278
3377
  meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3279
3378
  if SEARCH_KEY_UNNEST in df.columns:
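
Both the fit path above and the transform path now tag datetime-like feature columns as DATE_FEATURE and serialize them to date strings before upload. DateTimeConverter's detection logic is not shown in this diff; a rough pandas approximation of the idea:

    import pandas as pd

    def is_datetime_like(s: pd.Series, min_valid_share: float = 0.9) -> bool:
        # Heuristic: only consider string-like columns, and require that most
        # values parse cleanly as dates.
        if not (s.dtype == object or pd.api.types.is_string_dtype(s)):
            return False
        parsed = pd.to_datetime(s, errors="coerce")
        return parsed.notna().mean() >= min_valid_share

    def to_date_string(s: pd.Series) -> pd.Series:
        return pd.to_datetime(s, errors="coerce").dt.strftime("%Y-%m-%d")

    df = pd.DataFrame({"signup": ["2024-01-02", "2024-02-03"], "amount": [10, 20]})
    date_features = [c for c in df.columns if is_datetime_like(df[c])]
    for c in date_features:
        df[c] = to_date_string(df[c])
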
@@ -3294,8 +3393,8 @@ if response.status_code == 200:
3294
3393
  self.bundle,
3295
3394
  )
3296
3395
 
3297
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3298
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
3396
+ if DateTimeConverter.DATETIME_COL in df.columns:
3397
+ df = df.drop(columns=DateTimeConverter.DATETIME_COL)
3299
3398
 
3300
3399
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3301
3400
 
@@ -3320,6 +3419,8 @@ if response.status_code == 200:
3320
3419
  cv_type=self.cv,
3321
3420
  id_columns=self.__get_renamed_id_columns(),
3322
3421
  is_imbalanced=self.imbalanced,
3422
+ dropped_columns=[self.fit_columns_renaming.get(f, f) for f in self.fit_dropped_features],
3423
+ autodetected_search_keys=self.autodetected_search_keys,
3323
3424
  date_column=self._get_date_column(self.fit_search_keys),
3324
3425
  date_format=self.date_format,
3325
3426
  random_state=self.random_state,
@@ -3332,11 +3433,18 @@ if response.status_code == 200:
3332
3433
  dataset.columns_renaming = self.fit_columns_renaming
3333
3434
 
3334
3435
  self.passed_features = [
3335
- column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
3436
+ column
3437
+ for column, meaning_type in meaning_types.items()
3438
+ if meaning_type
3439
+ in [
3440
+ FileColumnMeaningType.FEATURE,
3441
+ FileColumnMeaningType.DATE_FEATURE,
3442
+ FileColumnMeaningType.GENERATED_FEATURE,
3443
+ ]
3336
3444
  ]
3337
3445
 
3338
3446
  self._search_task = dataset.search(
3339
- trace_id=trace_id,
3447
+ trace_id=self._get_trace_id(),
3340
3448
  progress_bar=progress_bar,
3341
3449
  start_time=start_time,
3342
3450
  progress_callback=progress_callback,
@@ -3356,7 +3464,7 @@ if response.status_code == 200:
3356
3464
  if not self.__is_registered:
3357
3465
  print(self.bundle.get("polling_unregister_information"))
3358
3466
 
3359
- progress = self.get_progress(trace_id)
3467
+ progress = self.get_progress()
3360
3468
  prev_progress = None
3361
3469
  progress.recalculate_eta(time.time() - start_time)
3362
3470
  if progress_bar is not None:
@@ -3382,16 +3490,16 @@ if response.status_code == 200:
3382
3490
  )
3383
3491
  raise RuntimeError(self.bundle.get("search_task_failed_status"))
3384
3492
  time.sleep(poll_period_seconds)
3385
- progress = self.get_progress(trace_id)
3493
+ progress = self.get_progress()
3386
3494
  except KeyboardInterrupt as e:
3387
3495
  print(self.bundle.get("search_stopping"))
3388
- self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
3389
- self._search_task = None
3496
+ self.rest_client.stop_search_task_v2(self._get_trace_id(), self._search_task.search_task_id)
3390
3497
  self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
3498
+ self._search_task = None
3391
3499
  print(self.bundle.get("search_stopped"))
3392
3500
  raise e
3393
3501
 
3394
- self._search_task.poll_result(trace_id, quiet=True)
3502
+ self._search_task.poll_result(self._get_trace_id(), quiet=True)
3395
3503
 
3396
3504
  seconds_left = time.time() - start_time
3397
3505
  progress = SearchProgress(97.0, ProgressStage.GENERATING_REPORT, seconds_left)
@@ -3420,10 +3528,9 @@ if response.status_code == 200:
3420
3528
  msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
3421
3529
  self.__log_warning(msg)
3422
3530
 
3423
- self.__prepare_feature_importances(trace_id, df)
3531
+ self.__prepare_feature_importances(df)
3424
3532
 
3425
3533
  self._select_features_by_psi(
3426
- trace_id=trace_id,
3427
3534
  X=X,
3428
3535
  y=y,
3429
3536
  eval_set=eval_set,
@@ -3436,7 +3543,7 @@ if response.status_code == 200:
3436
3543
  progress_callback=progress_callback,
3437
3544
  )
3438
3545
 
3439
- self.__prepare_feature_importances(trace_id, df)
3546
+ self.__prepare_feature_importances(df)
3440
3547
 
3441
3548
  self.__show_selected_features()
3442
3549
 
@@ -3471,7 +3578,6 @@ if response.status_code == 200:
3471
3578
  scoring,
3472
3579
  estimator,
3473
3580
  remove_outliers_calc_metrics,
3474
- trace_id,
3475
3581
  progress_bar,
3476
3582
  progress_callback,
3477
3583
  )
@@ -3557,7 +3663,8 @@ if response.status_code == 200:
3557
3663
  keys.append("EMAIL")
3558
3664
  if "DATE" in keys:
3559
3665
  keys.append("DATETIME")
3560
- search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
3666
+ autodetected_search_keys = self.autodetected_search_keys or {}
3667
+ search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
3561
3668
  return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
3562
3669
 
3563
3670
  def _validate_train_eval(
@@ -3566,11 +3673,10 @@ if response.status_code == 200:
3566
3673
  y: pd.Series | None = None,
3567
3674
  eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
3568
3675
  is_transform: bool = False,
3569
- silent: bool = False,
3570
3676
  ) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
3571
3677
  validated_X = self._validate_X(X, is_transform)
3572
3678
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
3573
- validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
3679
+ validated_eval_set = self._validate_eval_set(validated_X, eval_set)
3574
3680
  return validated_X, validated_y, validated_eval_set
3575
3681
 
3576
3682
  def _encode_id_columns(
@@ -3696,30 +3802,41 @@ if response.status_code == 200:
3696
3802
  return validated_y
3697
3803
 
3698
3804
  def _validate_eval_set(
3699
- self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
3700
- ):
3805
+ self,
3806
+ X: pd.DataFrame,
3807
+ eval_set: list[tuple[pd.DataFrame, pd.Series]] | None,
3808
+ ) -> list[tuple[pd.DataFrame, pd.Series]] | None:
3701
3809
  if eval_set is None:
3702
3810
  return None
3703
3811
  validated_eval_set = []
3704
- has_date = self._get_date_column(self.search_keys) is not None
3705
- for idx, eval_pair in enumerate(eval_set):
3812
+ for _, eval_pair in enumerate(eval_set):
3706
3813
  validated_pair = self._validate_eval_set_pair(X, eval_pair)
3707
- if validated_pair[1].isna().all():
3708
- if not has_date:
3709
- msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
3710
- elif self.columns_for_online_api:
3711
- msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
3712
- else:
3713
- msg = None
3714
- if msg:
3715
- if not silent:
3716
- print(msg)
3717
- self.logger.warning(msg)
3718
- continue
3719
3814
  validated_eval_set.append(validated_pair)
3720
3815
 
3721
3816
  return validated_eval_set
3722
3817
 
3818
+ def _validate_OOT(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
3819
+ if EVAL_SET_INDEX not in df.columns:
3820
+ return df
3821
+
3822
+ for eval_set_index in df[EVAL_SET_INDEX].unique():
3823
+ if eval_set_index == 0:
3824
+ continue
3825
+ eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
3826
+ date_col = self._get_date_column(search_keys)
3827
+ has_date = date_col is not None and date_col in eval_df.columns
3828
+ if eval_df[TARGET].isna().all():
3829
+ msg = None
3830
+ if not has_date:
3831
+ msg = self.bundle.get("oot_without_date_not_supported").format(eval_set_index)
3832
+ elif self.columns_for_online_api:
3833
+ msg = self.bundle.get("oot_with_online_sources_not_supported").format(eval_set_index)
3834
+ if msg:
3835
+ print(msg)
3836
+ self.logger.warning(msg)
3837
+ df = df[df[EVAL_SET_INDEX] != eval_set_index]
3838
+ return df
3839
+
3723
3840
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: tuple) -> tuple[pd.DataFrame, pd.Series]:
3724
3841
  if len(eval_pair) != 2:
3725
3842
  raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
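
_validate_OOT above drops eval_set folds whose target is entirely missing (out-of-time folds) when they cannot be supported, for example when no date search key is available. A stand-alone sketch of that filtering step, with lower-case placeholder column names instead of the EVAL_SET_INDEX and TARGET constants:

    import numpy as np
    import pandas as pd

    def drop_unsupported_oot(df: pd.DataFrame, date_col: str | None) -> pd.DataFrame:
        for fold in df["eval_set_index"].unique():
            if fold == 0:  # 0 is the train part, never an OOT fold
                continue
            fold_df = df[df["eval_set_index"] == fold]
            is_oot = fold_df["target"].isna().all()
            has_date = date_col is not None and date_col in df.columns
            if is_oot and not has_date:
                df = df[df["eval_set_index"] != fold]  # OOT needs a date column
        return df

    data = pd.DataFrame(
        {"eval_set_index": [0, 0, 1, 1], "target": [1.0, 0.0, np.nan, np.nan], "f": [1, 2, 3, 4]}
    )
    print(drop_unsupported_oot(data, date_col=None))  # fold 1 is removed
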
@@ -3860,8 +3977,8 @@ if response.status_code == 200:
3860
3977
  X = Xy.drop(columns=TARGET)
3861
3978
  y = Xy[TARGET].copy()
3862
3979
 
3863
- if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
3864
- X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
3980
+ if DateTimeConverter.DATETIME_COL in X.columns:
3981
+ X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)
3865
3982
 
3866
3983
  return X, y
3867
3984
 
@@ -3871,8 +3988,8 @@ if response.status_code == 200:
3871
3988
  X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
3872
3989
  ) -> tuple[pd.DataFrame, pd.Series]:
3873
3990
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
3874
- if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
3875
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
3991
+ if DateTimeConverter.DATETIME_COL in X.columns:
3992
+ date_column = DateTimeConverter.DATETIME_COL
3876
3993
  else:
3877
3994
  date_column = FeaturesEnricher._get_date_column(search_keys)
3878
3995
  sort_columns = [date_column] if date_column is not None else []
@@ -3900,8 +4017,8 @@ if response.status_code == 200:
3900
4017
 
3901
4018
  y = Xy[TARGET].copy()
3902
4019
 
3903
- if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
3904
- X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
4020
+ if DateTimeConverter.DATETIME_COL in X.columns:
4021
+ X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)
3905
4022
 
3906
4023
  return X, y
3907
4024
 
@@ -3980,12 +4097,10 @@ if response.status_code == 200:
3980
4097
  maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3981
4098
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
3982
4099
  # TODO cast date column to single dtype
3983
- date_converter = DateTimeSearchKeyConverter(
3984
- maybe_date_col, self.date_format, generate_cyclical_features=False
3985
- )
3986
- converted_X = date_converter.convert(X)
3987
- min_date = converted_X[maybe_date_col].min()
3988
- max_date = converted_X[maybe_date_col].max()
4100
+ date_converter = DateTimeConverter(maybe_date_col, self.date_format, generate_cyclical_features=False)
4101
+ date_col_values = date_converter.to_date_ms(X)
4102
+ min_date = date_col_values.min()
4103
+ max_date = date_col_values.max()
3989
4104
  self.logger.info(f"Dates interval is ({min_date}, {max_date})")
3990
4105
 
3991
4106
  except Exception:
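
The dates-interval check above now uses DateTimeConverter.to_date_ms and takes min/max over the resulting values instead of converting the whole frame. to_date_ms itself is not shown in this diff; the usual pandas way to express a date column as milliseconds since the Unix epoch looks roughly like:

    import pandas as pd

    def to_date_ms(s: pd.Series) -> pd.Series:
        # Parse to datetime, then count whole milliseconds since 1970-01-01.
        dt = pd.to_datetime(s, errors="coerce")
        return (dt - pd.Timestamp("1970-01-01")) // pd.Timedelta(milliseconds=1)

    dates = pd.Series(["2024-01-01", "2024-03-15"])
    values = to_date_ms(dates)
    print(values.min(), values.max())
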
@@ -4017,12 +4132,14 @@ if response.status_code == 200:
4017
4132
  or set(search_keys.values()) == {SearchKey.EMAIL}
4018
4133
  or set(search_keys.values()) == {SearchKey.HEM}
4019
4134
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
4135
+ or len(search_keys) == 0
4136
+ or set(search_keys.values()) == {SearchKey.CUSTOM_KEY}
4020
4137
  ):
4021
4138
  if not silent:
4022
4139
  self.__log_warning(bundle.get("current_date_added"))
4023
- df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
4024
- search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
4025
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
4140
+ df[CURRENT_DATE_COL] = datetime.date.today()
4141
+ search_keys[CURRENT_DATE_COL] = SearchKey.DATE
4142
+ converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
4026
4143
  df = converter.convert(df)
4027
4144
  return df
4028
4145
 
@@ -4036,7 +4153,7 @@ if response.status_code == 200:
4036
4153
  return [
4037
4154
  col
4038
4155
  for col, t in search_keys.items()
4039
- if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
4156
+ if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
4040
4157
  ]
4041
4158
 
4042
4159
  @staticmethod
@@ -4153,8 +4270,8 @@ if response.status_code == 200:
4153
4270
  "__target",
4154
4271
  ENTITY_SYSTEM_RECORD_ID,
4155
4272
  ]
4156
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
4157
- date_column = DateTimeSearchKeyConverter.DATETIME_COL
4273
+ if DateTimeConverter.DATETIME_COL in df.columns:
4274
+ date_column = DateTimeConverter.DATETIME_COL
4158
4275
  sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
4159
4276
  else:
4160
4277
  date_column = FeaturesEnricher._get_date_column(search_keys)
@@ -4335,47 +4452,6 @@ if response.status_code == 200:
4335
4452
 
4336
4453
  return result_features
4337
4454
 
4338
- def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
4339
- if self._search_task is None:
4340
- raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
4341
- features_meta = self._search_task.get_all_features_metadata_v2()
4342
- if features_meta is None:
4343
- raise Exception(self.bundle.get("missing_features_meta"))
4344
- features_meta = deepcopy(features_meta)
4345
-
4346
- original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
4347
- df = df.rename(columns=original_names_dict)
4348
-
4349
- features_meta.sort(key=lambda m: (-m.shap_value, m.name))
4350
-
4351
- importances = {}
4352
-
4353
- for feature_meta in features_meta:
4354
- if feature_meta.name in original_names_dict.keys():
4355
- feature_meta.name = original_names_dict[feature_meta.name]
4356
-
4357
- is_client_feature = feature_meta.name in df.columns
4358
-
4359
- if feature_meta.shap_value == 0.0:
4360
- continue
4361
-
4362
- # Use only important features
4363
- if (
4364
- feature_meta.name == COUNTRY
4365
- # In select_features mode we select also from etalon features and need to show them
4366
- or (not self.fit_select_features and is_client_feature)
4367
- ):
4368
- continue
4369
-
4370
- # Temporary workaround for duplicate features metadata
4371
- if feature_meta.name in importances:
4372
- self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
4373
- continue
4374
-
4375
- importances[feature_meta.name] = feature_meta.shap_value
4376
-
4377
- return importances
4378
-
4379
4455
  def __get_categorical_features(self) -> list[str]:
4380
4456
  features_meta = self._search_task.get_all_features_metadata_v2()
4381
4457
  if features_meta is None:
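
The removed helper above ordered feature metadata by descending SHAP value, skipped zero-importance entries, and guarded against duplicated metadata; the surviving __prepare_feature_importances path keeps the same ordering idea. A small stand-alone illustration of that step (toy metadata type, not upgini's classes):

    from dataclasses import dataclass

    @dataclass
    class FeatureMeta:
        name: str
        shap_value: float

    metas = [FeatureMeta("b", 0.2), FeatureMeta("a", 0.2), FeatureMeta("a", 0.2), FeatureMeta("c", 0.0)]
    metas.sort(key=lambda m: (-m.shap_value, m.name))  # highest SHAP first, ties by name

    importances: dict[str, float] = {}
    for m in metas:
        if m.shap_value == 0.0:
            continue  # keep only features that actually contributed
        if m.name in importances:
            continue  # tolerate duplicated metadata entries
        importances[m.name] = m.shap_value

    print(importances)  # {'a': 0.2, 'b': 0.2}
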
@@ -4385,7 +4461,6 @@ if response.status_code == 200:
4385
4461
 
4386
4462
  def __prepare_feature_importances(
4387
4463
  self,
4388
- trace_id: str,
4389
4464
  clients_features_df: pd.DataFrame,
4390
4465
  updated_shaps: dict[str, float] | None = None,
4391
4466
  update_selected_features: bool = True,
@@ -4393,14 +4468,16 @@ if response.status_code == 200:
4393
4468
  ):
4394
4469
  if self._search_task is None:
4395
4470
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
4396
- selected_features = self._search_task.get_selected_features(trace_id)
4471
+ selected_features = self._search_task.get_selected_features(self._get_trace_id())
4397
4472
  features_meta = self._search_task.get_all_features_metadata_v2()
4398
4473
  if features_meta is None:
4399
4474
  raise Exception(self.bundle.get("missing_features_meta"))
4400
4475
  features_meta = deepcopy(features_meta)
4401
4476
 
4402
- original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
4403
- features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
4477
+ file_metadata_columns = self._search_task.get_file_metadata(self._get_trace_id()).columns
4478
+ file_meta_by_orig_name = {c.originalName: c for c in file_metadata_columns}
4479
+ original_names_dict = {c.name: c.originalName for c in file_metadata_columns}
4480
+ features_df = self._search_task.get_all_initial_raw_features(self._get_trace_id(), metrics_calculation=True)
4404
4481
 
4405
4482
  # To be sure that names with hash suffixes
4406
4483
  clients_features_df = clients_features_df.rename(columns=original_names_dict)
@@ -4419,10 +4496,13 @@ if response.status_code == 200:
4419
4496
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4420
4497
  feature_meta.name = original_name
4421
4498
 
4422
- is_client_feature = original_name in clients_features_df.columns
4499
+ file_meta = file_meta_by_orig_name.get(original_name)
4500
+ is_generated_feature = (
4501
+ file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
4502
+ )
4503
+ is_client_feature = original_name in clients_features_df.columns and not is_generated_feature
4423
4504
 
4424
4505
  if selected_features is not None and feature_meta.name not in selected_features:
4425
- self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
4426
4506
  continue
4427
4507
 
4428
4508
  selected_features_meta.append(feature_meta)
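
Columns flagged as GENERATED_FEATURE in the file metadata are now distinguished from genuine client columns, so a generated column that also appears in the input frame is no longer misclassified as a client feature. A toy illustration of that lookup, with simplified stand-ins for the metadata classes:

    from dataclasses import dataclass
    from enum import Enum

    class MeaningType(Enum):
        FEATURE = "FEATURE"
        GENERATED_FEATURE = "GENERATED_FEATURE"

    @dataclass
    class ColumnMeta:
        original_name: str
        meaning_type: MeaningType

    file_columns = [
        ColumnMeta("age", MeaningType.FEATURE),
        ColumnMeta("comment_emb0", MeaningType.GENERATED_FEATURE),
    ]
    meta_by_name = {c.original_name: c for c in file_columns}

    def is_generated(name: str) -> bool:
        meta = meta_by_name.get(name)
        return meta is not None and meta.meaning_type is MeaningType.GENERATED_FEATURE

    client_columns = {"age", "comment_emb0"}
    is_client = {n: (n in client_columns and not is_generated(n)) for n in client_columns}
    print(is_client)  # {'age': True, 'comment_emb0': False}
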
@@ -4442,9 +4522,13 @@ if response.status_code == 200:
4442
4522
 
4443
4523
  for feature_meta in selected_features_meta:
4444
4524
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4445
- is_client_feature = original_name in clients_features_df.columns
4525
+ file_meta = file_meta_by_orig_name.get(original_name)
4526
+ is_generated_feature = (
4527
+ file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
4528
+ )
4529
+ is_client_feature = original_name in clients_features_df.columns and not is_generated_feature
4446
4530
 
4447
- if not is_client_feature:
4531
+ if not is_client_feature and not is_generated_feature:
4448
4532
  self.external_source_feature_names.append(original_name)
4449
4533
 
4450
4534
  if self.psi_values is not None:
@@ -4475,20 +4559,21 @@ if response.status_code == 200:
4475
4559
 
4476
4560
  self.feature_names_.append(feature_meta.name)
4477
4561
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
4478
-
4479
4562
  df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
4480
- feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
4563
+ feature_info = FeatureInfo.from_metadata(
4564
+ feature_meta, df_for_sample, is_client_feature, is_generated_feature
4565
+ )
4481
4566
  features_info.append(feature_info.to_row(self.bundle))
4482
4567
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
4483
4568
  internal_features_info.append(feature_info.to_internal_row(self.bundle))
4484
4569
 
4485
4570
  if update_selected_features:
4486
- self._search_task.update_selected_features(trace_id, self.feature_names_)
4571
+ self._search_task.update_selected_features(self._get_trace_id(), self.feature_names_)
4487
4572
 
4488
4573
  if len(features_info) > 0:
4489
4574
  self.features_info = pd.DataFrame(features_info)
4490
4575
  # If all psi values are 0 or null, drop psi column
4491
- if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
4576
+ if self.features_info[self.bundle.get("features_info_psi")].astype(np.float64).fillna(0.0).eq(0.0).all():
4492
4577
  self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
4493
4578
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
4494
4579
  self._internal_features_info = pd.DataFrame(internal_features_info)
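
The PSI column is now cast to float before the all-zero check, so missing values compare as 0.0 instead of tripping the equality test. The same guard in isolation, with a placeholder column name instead of the bundle key:

    import numpy as np
    import pandas as pd

    features_info = pd.DataFrame({"feature": ["a", "b"]})
    features_info["psi"] = pd.Series([None, 0], dtype="object")
    # Cast to float so None/NaN and integer zeros all compare uniformly with 0.0.
    if features_info["psi"].astype(np.float64).fillna(0.0).eq(0.0).all():
        features_info = features_info.drop(columns=["psi"])
    print(features_info.columns.tolist())  # ['feature']
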
@@ -4681,12 +4766,17 @@ if response.status_code == 200:
4681
4766
  ):
4682
4767
  raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
4683
4768
 
4684
- if self.autodetect_search_keys and (
4685
- not is_transform or set(valid_search_keys.values()) != set(self.fit_search_keys.values())
4686
- ):
4687
- valid_search_keys = self.__detect_missing_search_keys(
4688
- x, valid_search_keys, is_demo_dataset, silent_mode, is_transform
4689
- )
4769
+ if is_transform:
4770
+ fit_autodetected_search_keys = self._get_autodetected_search_keys()
4771
+ if fit_autodetected_search_keys is not None:
4772
+ for key in fit_autodetected_search_keys.keys():
4773
+ if key not in x.columns:
4774
+ raise ValidationError(
4775
+ self.bundle.get("autodetected_search_key_not_found").format(key, x.columns)
4776
+ )
4777
+ valid_search_keys.update(fit_autodetected_search_keys)
4778
+ elif self.autodetect_search_keys:
4779
+ valid_search_keys = self.__detect_missing_search_keys(x, valid_search_keys, is_demo_dataset)
4690
4780
 
4691
4781
  if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
4692
4782
  if self.__is_registered:
@@ -4694,7 +4784,8 @@ if response.status_code == 200:
4694
4784
  else:
4695
4785
  msg = self.bundle.get("unregistered_only_personal_keys")
4696
4786
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
4697
- raise ValidationError(msg)
4787
+ # Current date will be added later
4788
+ # raise ValidationError(msg)
4698
4789
 
4699
4790
  if (
4700
4791
  len(valid_search_keys.values()) == 1
@@ -4708,7 +4799,7 @@ if response.status_code == 200:
4708
4799
  maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
4709
4800
  if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
4710
4801
  date_column = next(iter(maybe_date))
4711
- if x[date_column].nunique() > 0.9 * _num_samples(x):
4802
+ if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
4712
4803
  msg = self.bundle.get("date_search_without_time_series")
4713
4804
  self.__log_warning(msg)
4714
4805
 
@@ -4723,6 +4814,8 @@ if response.status_code == 200:
4723
4814
 
4724
4815
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
4725
4816
 
4817
+ # x = self._validate_empty_search_keys(x, valid_search_keys, is_transform=is_transform)
4818
+
4726
4819
  return valid_search_keys
4727
4820
 
4728
4821
  def __show_metrics(
@@ -4730,7 +4823,6 @@ if response.status_code == 200:
4730
4823
  scoring: Callable | str | None,
4731
4824
  estimator: Any | None,
4732
4825
  remove_outliers_calc_metrics: bool | None,
4733
- trace_id: str,
4734
4826
  progress_bar: ProgressBar | None = None,
4735
4827
  progress_callback: Callable[[SearchProgress], Any] | None = None,
4736
4828
  ):
@@ -4738,7 +4830,6 @@ if response.status_code == 200:
4738
4830
  scoring=scoring,
4739
4831
  estimator=estimator,
4740
4832
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
4741
- trace_id=trace_id,
4742
4833
  internal_call=True,
4743
4834
  progress_bar=progress_bar,
4744
4835
  progress_callback=progress_callback,
@@ -4803,80 +4894,67 @@ if response.status_code == 200:
4803
4894
  df: pd.DataFrame,
4804
4895
  search_keys: dict[str, SearchKey],
4805
4896
  is_demo_dataset: bool,
4806
- silent_mode=False,
4807
- is_transform=False,
4808
4897
  ) -> dict[str, SearchKey]:
4809
4898
  sample = df.head(100)
4810
4899
 
4811
- def check_need_detect(search_key: SearchKey):
4812
- return not is_transform or (
4813
- search_key in self.fit_search_keys.values() and search_key not in search_keys.values()
4814
- )
4815
-
4816
- # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
4817
- if check_need_detect(SearchKey.POSTAL_CODE):
4818
- maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
4819
- if maybe_keys:
4820
- new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
4900
+ if SearchKey.DATE not in search_keys.values() and SearchKey.DATETIME not in search_keys.values():
4901
+ maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
4902
+ if len(maybe_keys) > 0:
4903
+ datetime_key = maybe_keys[0]
4904
+ new_keys = {datetime_key: SearchKey.DATETIME}
4821
4905
  search_keys.update(new_keys)
4822
- self.autodetected_search_keys.update(new_keys)
4823
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
4824
- if not silent_mode:
4825
- print(self.bundle.get("postal_code_detected").format(maybe_keys))
4906
+ self._add_autodetected_search_keys(new_keys)
4907
+ self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
4908
+ print(self.bundle.get("datetime_detected").format(datetime_key))
4826
4909
 
4827
- if (
4828
- SearchKey.COUNTRY not in search_keys.values()
4829
- and self.country_code is None
4830
- and check_need_detect(SearchKey.COUNTRY)
4831
- ):
4910
+ # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
4911
+ maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
4912
+ if maybe_keys:
4913
+ new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
4914
+ search_keys.update(new_keys)
4915
+ self._add_autodetected_search_keys(new_keys)
4916
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
4917
+ print(self.bundle.get("postal_code_detected").format(maybe_keys))
4918
+
4919
+ if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
4832
4920
  maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
4833
4921
  if maybe_key:
4834
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
4835
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
4922
+ new_keys = {maybe_key[0]: SearchKey.COUNTRY}
4923
+ search_keys.update(new_keys)
4924
+ self._add_autodetected_search_keys(new_keys)
4836
4925
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
4837
- if not silent_mode:
4838
- print(self.bundle.get("country_detected").format(maybe_key))
4926
+ print(self.bundle.get("country_detected").format(maybe_key))
4839
4927
 
4840
- if (
4841
- # SearchKey.EMAIL not in search_keys.values()
4842
- SearchKey.HEM not in search_keys.values()
4843
- and check_need_detect(SearchKey.HEM)
4844
- ):
4928
+ if SearchKey.EMAIL not in search_keys.values() and SearchKey.HEM not in search_keys.values():
4845
4929
  maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
4846
4930
  if maybe_keys:
4847
4931
  if self.__is_registered or is_demo_dataset:
4848
4932
  new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
4849
4933
  search_keys.update(new_keys)
4850
- self.autodetected_search_keys.update(new_keys)
4934
+ self._add_autodetected_search_keys(new_keys)
4851
4935
  self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
4852
- if not silent_mode:
4853
- print(self.bundle.get("email_detected").format(maybe_keys))
4936
+ print(self.bundle.get("email_detected").format(maybe_keys))
4854
4937
  else:
4855
4938
  self.logger.warning(
4856
4939
  f"Autodetected search key EMAIL in column {maybe_keys}."
4857
4940
  " But not used because not registered user"
4858
4941
  )
4859
- if not silent_mode:
4860
- self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
4942
+ self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
4861
4943
 
4862
4944
  # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
4863
- if check_need_detect(SearchKey.PHONE):
4864
- maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
4865
- if maybe_keys:
4866
- if self.__is_registered or is_demo_dataset:
4867
- new_keys = {key: SearchKey.PHONE for key in maybe_keys}
4868
- search_keys.update(new_keys)
4869
- self.autodetected_search_keys.update(new_keys)
4870
- self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
4871
- if not silent_mode:
4872
- print(self.bundle.get("phone_detected").format(maybe_keys))
4873
- else:
4874
- self.logger.warning(
4875
- f"Autodetected search key PHONE in column {maybe_keys}. "
4876
- "But not used because not registered user"
4877
- )
4878
- if not silent_mode:
4879
- self.__log_warning(self.bundle.get("phone_detected_not_registered"))
4945
+ maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
4946
+ if maybe_keys:
4947
+ if self.__is_registered or is_demo_dataset:
4948
+ new_keys = {key: SearchKey.PHONE for key in maybe_keys}
4949
+ search_keys.update(new_keys)
4950
+ self._add_autodetected_search_keys(new_keys)
4951
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
4952
+ print(self.bundle.get("phone_detected").format(maybe_keys))
4953
+ else:
4954
+ self.logger.warning(
4955
+ f"Autodetected search key PHONE in column {maybe_keys}. " "But not used because not registered user"
4956
+ )
4957
+ self.__log_warning(self.bundle.get("phone_detected_not_registered"))
4880
4958
 
4881
4959
  return search_keys
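
The autodetection above runs each detector against df.head(100) and records the detected columns both in search_keys and in the fitted autodetected keys. The detector classes themselves are not part of this diff; a rough sketch of how a simple value-based detector (here: e-mail addresses) can work on such a sample:

    import pandas as pd

    EMAIL_PATTERN = r"^[^@\s]+@[^@\s]+\.[^@\s]+$"

    def detect_email_columns(sample: pd.DataFrame, min_share: float = 0.8) -> list[str]:
        detected = []
        for col in sample.columns:
            values = sample[col].dropna().astype(str)
            # Flag the column if most sampled values look like e-mail addresses.
            if len(values) and values.str.match(EMAIL_PATTERN).mean() >= min_share:
                detected.append(col)
        return detected

    df = pd.DataFrame({"mail": ["a@b.com", "c@d.org"], "age": [30, 40]})
    print(detect_email_columns(df.head(100)))  # ['mail']
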
4882
4960
 
@@ -4948,13 +5026,12 @@ if response.status_code == 200:
4948
5026
 
4949
5027
  def dump_input(
4950
5028
  self,
4951
- trace_id: str,
4952
5029
  X: pd.DataFrame | pd.Series,
4953
5030
  y: pd.DataFrame | pd.Series | None = None,
4954
5031
  eval_set: tuple | None = None,
4955
5032
  ):
4956
- def dump_task(X_, y_, eval_set_):
4957
- with MDC(trace_id=trace_id):
5033
+ def dump_task(X_, y_, eval_set_, trace_id_):
5034
+ with MDC(correlation_id=trace_id_):
4958
5035
  try:
4959
5036
  if isinstance(X_, pd.Series):
4960
5037
  X_ = X_.to_frame()
@@ -4962,13 +5039,13 @@ if response.status_code == 200:
4962
5039
  with tempfile.TemporaryDirectory() as tmp_dir:
4963
5040
  X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
4964
5041
  x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
4965
- if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
5042
+ if self.rest_client.is_file_uploaded(trace_id_, x_digest_sha256):
4966
5043
  self.logger.info(
4967
5044
  f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
4968
5045
  )
4969
5046
  else:
4970
5047
  self.rest_client.dump_input_file(
4971
- trace_id, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
5048
+ trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
4972
5049
  )
4973
5050
 
4974
5051
  if y_ is not None:
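
dump_input hashes each parquet file and asks the backend whether that digest was already uploaded before sending it again. file_hash is imported from elsewhere in the package; a typical streaming SHA-256 implementation of the same idea:

    import hashlib

    def file_hash(path: str) -> str:
        # Stream the file in 1 MiB blocks so large parquet files never load fully into memory.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for block in iter(lambda: f.read(1 << 20), b""):
                digest.update(block)
        return digest.hexdigest()

The upload is then skipped whenever the computed digest is already known on the server side.
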
@@ -4976,13 +5053,13 @@ if response.status_code == 200:
4976
5053
  y_ = y_.to_frame()
4977
5054
  y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
4978
5055
  y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
4979
- if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
5056
+ if self.rest_client.is_file_uploaded(trace_id_, y_digest_sha256):
4980
5057
  self.logger.info(
4981
5058
  f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
4982
5059
  )
4983
5060
  else:
4984
5061
  self.rest_client.dump_input_file(
4985
- trace_id, f"{tmp_dir}/y.parquet", "y.parquet", y_digest_sha256
5062
+ trace_id_, f"{tmp_dir}/y.parquet", "y.parquet", y_digest_sha256
4986
5063
  )
4987
5064
 
4988
5065
  if eval_set_ is not None and len(eval_set_) > 0:
@@ -4991,14 +5068,14 @@ if response.status_code == 200:
4991
5068
  eval_x_ = eval_x_.to_frame()
4992
5069
  eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
4993
5070
  eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
4994
- if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
5071
+ if self.rest_client.is_file_uploaded(trace_id_, eval_x_digest_sha256):
4995
5072
  self.logger.info(
4996
5073
  f"File eval_x_{idx}.parquet was already uploaded with"
4997
5074
  f" digest {eval_x_digest_sha256}, skipping"
4998
5075
  )
4999
5076
  else:
5000
5077
  self.rest_client.dump_input_file(
5001
- trace_id,
5078
+ trace_id_,
5002
5079
  f"{tmp_dir}/eval_x_{idx}.parquet",
5003
5080
  f"eval_x_{idx}.parquet",
5004
5081
  eval_x_digest_sha256,
@@ -5008,14 +5085,14 @@ if response.status_code == 200:
5008
5085
  eval_y_ = eval_y_.to_frame()
5009
5086
  eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
5010
5087
  eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
5011
- if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
5088
+ if self.rest_client.is_file_uploaded(trace_id_, eval_y_digest_sha256):
5012
5089
  self.logger.info(
5013
5090
  f"File eval_y_{idx}.parquet was already uploaded"
5014
5091
  f" with digest {eval_y_digest_sha256}, skipping"
5015
5092
  )
5016
5093
  else:
5017
5094
  self.rest_client.dump_input_file(
5018
- trace_id,
5095
+ trace_id_,
5019
5096
  f"{tmp_dir}/eval_y_{idx}.parquet",
5020
5097
  f"eval_y_{idx}.parquet",
5021
5098
  eval_y_digest_sha256,
@@ -5024,7 +5101,8 @@ if response.status_code == 200:
5024
5101
  self.logger.warning("Failed to dump input files", exc_info=True)
5025
5102
 
5026
5103
  try:
5027
- Thread(target=dump_task, args=(X, y, eval_set), daemon=True).start()
5104
+ trace_id = self._get_trace_id()
5105
+ Thread(target=dump_task, args=(X, y, eval_set, trace_id), daemon=True).start()
5028
5106
  except Exception:
5029
5107
  self.logger.warning("Failed to dump input files", exc_info=True)
5030
5108
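
Throughout dump_input the trace id is now passed into the daemon thread and bound to the logging context (MDC) inside the worker, instead of being captured from the enclosing scope. A generic sketch of that pattern, using the standard logging module in place of upgini's MDC helper:

    import logging
    import threading

    logger = logging.getLogger("dump")

    def dump_task(payload: bytes, trace_id: str) -> None:
        # The id travels as an explicit argument, so the worker does not depend
        # on state captured from the caller's scope.
        try:
            logger.info("dumping %d bytes", len(payload), extra={"trace_id": trace_id})
            # ... write the payload to temporary storage and upload it ...
        except Exception:
            logger.warning("Failed to dump input files", exc_info=True)

    threading.Thread(target=dump_task, args=(b"data", "trace-123"), daemon=True).start()
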