upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -3
- upgini/data_source/data_source_publisher.py +1 -9
- upgini/dataset.py +56 -6
- upgini/features_enricher.py +634 -556
- upgini/http.py +2 -2
- upgini/metadata.py +16 -2
- upgini/normalizer/normalize_utils.py +6 -6
- upgini/resource_bundle/strings.properties +15 -11
- upgini/search_task.py +14 -2
- upgini/utils/base_search_key_detector.py +5 -1
- upgini/utils/datetime_utils.py +125 -39
- upgini/utils/deduplicate_utils.py +8 -5
- upgini/utils/display_utils.py +61 -20
- upgini/utils/feature_info.py +18 -7
- upgini/utils/features_validator.py +6 -4
- upgini/utils/postal_code_utils.py +35 -2
- upgini/utils/target_utils.py +3 -1
- upgini/utils/track_info.py +29 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/METADATA +123 -121
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/RECORD +23 -23
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/WHEEL +1 -1
- {upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/licenses/LICENSE +0 -0
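The Python sketches interspersed below are illustrative only; they are inferred from the signatures visible in this diff rather than taken from upgini's documentation. Note that 1.2.146a4 is a pre-release, so pip will not select it unless the version is pinned explicitly (for example `pip install "upgini==1.2.146a4"`); a quick check of which build ended up installed:

```python
# Minimal sketch: confirm which upgini build is active in the current environment.
from importlib.metadata import version

print(version("upgini"))  # expected "1.2.146a4" after pinning the pre-release
```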
upgini/features_enricher.py
CHANGED
@@ -42,8 +42,10 @@ from upgini.http import (
  get_rest_client,
  )
  from upgini.mdc import MDC
+ from upgini.mdc.context import get_mdc_fields
  from upgini.metadata import (
  COUNTRY,
+ CURRENT_DATE_COL,
  DEFAULT_INDEX,
  ENTITY_SYSTEM_RECORD_ID,
  EVAL_SET_INDEX,
@@ -76,7 +78,8 @@ from upgini.utils.custom_loss_utils import (
  )
  from upgini.utils.cv_utils import CVConfig, get_groups
  from upgini.utils.datetime_utils import (
-
+ DateSearchKeyDetector,
+ DateTimeConverter,
  is_blocked_time_series,
  is_dates_distribution_valid,
  is_time_series,
@@ -167,7 +170,6 @@ class FeaturesEnricher(TransformerMixin):
  """

  TARGET_NAME = "target"
- CURRENT_DATE = "current_date"
  RANDOM_STATE = 42
  CALCULATE_METRICS_THRESHOLD = 50_000_000
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -220,7 +222,9 @@ class FeaturesEnricher(TransformerMixin):
  cv: CVType | None = None,
  loss: str | None = None,
  autodetect_search_keys: bool = True,
+ # deprecated, use text_features instead
  generate_features: list[str] | None = None,
+ text_features: list[str] | None = None,
  columns_for_online_api: list[str] | None = None,
  round_embeddings: int | None = None,
  logs_enabled: bool = True,
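The constructor hunk above adds `text_features` and marks `generate_features` as deprecated; later in this diff the two are merged with `text_features or generate_features`, so existing code keeps working. A hedged sketch of switching to the new argument (the import style follows upgini's README; the DataFrame and column names are made up for illustration):

```python
import pandas as pd
from upgini import FeaturesEnricher, SearchKey

train = pd.DataFrame({
    "rep_date": ["2024-01-01", "2024-01-02", "2024-01-03"],
    "description": ["late delivery", "damaged box", "wrong size"],
    "target": [1, 0, 1],
})

# Previously: FeaturesEnricher(..., generate_features=["description"])
enricher = FeaturesEnricher(
    search_keys={"rep_date": SearchKey.DATE},
    text_features=["description"],  # new name for the same list of text columns
)
# enricher.fit(train.drop(columns="target"), train["target"])
```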
@@ -236,6 +240,7 @@ class FeaturesEnricher(TransformerMixin):
  generate_search_key_features: bool = True,
  sample_config: SampleConfig | None = None,
  print_trace_id: bool = False,
+ print_loaded_report: bool = True,
  **kwargs,
  ):
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -269,7 +274,7 @@ class FeaturesEnricher(TransformerMixin):
  self.X: pd.DataFrame | None = None
  self.y: pd.Series | None = None
  self.eval_set: list[tuple] | None = None
- self.autodetected_search_keys: dict[str, SearchKey] =
+ self.autodetected_search_keys: dict[str, SearchKey] | None = None
  self.imbalanced = False
  self.fit_select_features = True
  self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
@@ -282,7 +287,7 @@ class FeaturesEnricher(TransformerMixin):
  self.id_columns = id_columns
  self.id_columns_encoder = None
  self.country_code = country_code
- self.__validate_search_keys(search_keys, search_id)
+ self.__validate_search_keys(self.search_keys, search_id)

  self.model_task_type = ModelTaskType.parse(model_task_type)
  self.endpoint = endpoint
@@ -305,10 +310,8 @@ class FeaturesEnricher(TransformerMixin):
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

  print(self.bundle.get("search_by_task_id_start"))
- trace_id =
-
- print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
- with MDC(trace_id=trace_id):
+ trace_id = self._get_trace_id()
+ with MDC(correlation_id=trace_id, search_task_id=search_id):
  try:
  self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
@@ -316,8 +319,9 @@ class FeaturesEnricher(TransformerMixin):
  x_columns = [c.name for c in file_metadata.columns]
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
  df = pd.DataFrame(columns=x_columns)
- self.__prepare_feature_importances(
-
+ self.__prepare_feature_importances(df, silent=True, update_selected_features=False)
+ if print_loaded_report:
+ self.__show_selected_features()
  # TODO validate search_keys with search_keys from file_metadata
  print(self.bundle.get("search_by_task_id_finish"))
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -342,14 +346,14 @@ class FeaturesEnricher(TransformerMixin):
  self.shared_datasets = shared_datasets
  if shared_datasets is not None:
  self.runtime_parameters.properties["shared_datasets"] = ",".join(shared_datasets)
- self.generate_features = generate_features
+ self.generate_features = text_features or generate_features
  self.round_embeddings = round_embeddings
- if generate_features is not None:
- if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
+ if self.generate_features is not None:
+ if len(self.generate_features) > self.GENERATE_FEATURES_LIMIT:
  msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
  self.logger.error(msg)
  raise ValidationError(msg)
- self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
+ self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
  if round_embeddings is not None:
  if not isinstance(round_embeddings, int) or round_embeddings < 0:
  msg = self.bundle.get("invalid_round_embeddings")
@@ -484,9 +488,9 @@ class FeaturesEnricher(TransformerMixin):
  stability_agg_func: str, optional (default="max")
  Function to aggregate stability values. Can be "max", "min", "mean".
  """
- trace_id =
+ trace_id = self._get_trace_id()
  if self.print_trace_id:
- print(f"https://app.datadoghq.eu/logs?query=%
+ print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
  start_time = time.time()
  auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
  search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
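Throughout this release the logging context switches from `trace_id` to `correlation_id`, and the id itself now comes from `self._get_trace_id()`; with `print_trace_id=True` the printed Datadog link therefore filters on `%40correlation_id`. A small sketch of turning that on (other constructor arguments are omitted and the search key is illustrative):

```python
from upgini import FeaturesEnricher, SearchKey

enricher = FeaturesEnricher(
    search_keys={"rep_date": SearchKey.DATE},
    print_trace_id=True,  # fit()/transform() print a logs URL filtered by correlation_id
)
```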
@@ -498,7 +502,7 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar.progress = search_progress.to_progress_bar()
  progress_bar.display()

- with MDC(
+ with MDC(correlation_id=trace_id):
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for fit: {args}"
  self.logger.warning(msg)
@@ -519,10 +523,9 @@ class FeaturesEnricher(TransformerMixin):
  self.X = X
  self.y = y
  self.eval_set = self._check_eval_set(eval_set, X)
- self.dump_input(
+ self.dump_input(X, y, self.eval_set)
  self.__set_select_features(select_features)
  self.__inner_fit(
- trace_id,
  X,
  y,
  self.eval_set,
@@ -643,11 +646,11 @@ class FeaturesEnricher(TransformerMixin):

  self.warning_counter.reset()
  auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
- trace_id =
+ trace_id = self._get_trace_id()
  if self.print_trace_id:
- print(f"https://app.datadoghq.eu/logs?query=%
+ print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
  start_time = time.time()
- with MDC(
+ with MDC(correlation_id=trace_id):
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for fit_transform: {args}"
  self.logger.warning(msg)
@@ -674,13 +677,9 @@ class FeaturesEnricher(TransformerMixin):
  self.y = y
  self.eval_set = self._check_eval_set(eval_set, X)
  self.__set_select_features(select_features)
- self.dump_input(
-
- if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
- raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
+ self.dump_input(X, y, self.eval_set)

  self.__inner_fit(
- trace_id,
  X,
  y,
  self.eval_set,
@@ -732,9 +731,9 @@ class FeaturesEnricher(TransformerMixin):

  result = self.transform(
  X,
+ y,
  exclude_features_sources=exclude_features_sources,
  keep_input=keep_input,
- trace_id=trace_id,
  silent_mode=True,
  progress_bar=progress_bar,
  progress_callback=progress_callback,
@@ -745,12 +744,10 @@ class FeaturesEnricher(TransformerMixin):
  def transform(
  self,
  X: pd.DataFrame,
- *args,
  y: pd.Series | None = None,
+ *args,
  exclude_features_sources: list[str] | None = None,
  keep_input: bool = True,
- trace_id: str | None = None,
- metrics_calculation: bool = False,
  silent_mode=False,
  progress_bar: ProgressBar | None = None,
  progress_callback: Callable[[SearchProgress], Any] | None = None,
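With the hunk above, `transform` drops the `trace_id` and `metrics_calculation` parameters and moves `y` ahead of `*args`, so a target can be passed positionally. Callers that previously forwarded `trace_id=` need to remove that argument; a sketch under the assumption that `enricher` was fitted earlier and `X_new` has the same search-key columns:

```python
# After the signature change, only the data (and optionally y) is passed:
enriched = enricher.transform(X_new, keep_input=True)

# y may now be given positionally right after X when it is available:
# enriched = enricher.transform(X_new, y_new)
```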
@@ -787,10 +784,12 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar.progress = search_progress.to_progress_bar()
  if new_progress:
  progress_bar.display()
- trace_id =
+ trace_id = self._get_trace_id()
+ if self.print_trace_id:
+ print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
  search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
- with MDC(
- self.dump_input(
+ with MDC(correlation_id=trace_id, search_id=search_id):
+ self.dump_input(X)
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for transform: {args}"
  self.logger.warning(msg)
@@ -803,15 +802,15 @@ class FeaturesEnricher(TransformerMixin):
  start_time = time.time()
  try:
  result, _, _, _ = self.__inner_transform(
- trace_id,
  X,
  y=y,
  exclude_features_sources=exclude_features_sources,
- metrics_calculation=metrics_calculation,
  silent_mode=silent_mode,
  progress_bar=progress_bar,
  keep_input=keep_input,
  )
+ if result is not None and TARGET in result.columns:
+ result = result.drop(columns=TARGET)
  self.logger.info("Transform finished successfully")
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
  if progress_bar is not None:
@@ -866,7 +865,6 @@ class FeaturesEnricher(TransformerMixin):
  estimator=None,
  exclude_features_sources: list[str] | None = None,
  remove_outliers_calc_metrics: bool | None = None,
- trace_id: str | None = None,
  internal_call: bool = False,
  progress_bar: ProgressBar | None = None,
  progress_callback: Callable[[SearchProgress], Any] | None = None,
@@ -904,10 +902,10 @@ class FeaturesEnricher(TransformerMixin):
  Dataframe with metrics calculated on train and validation datasets.
  """

- trace_id =
+ trace_id = self._get_trace_id()
  start_time = time.time()
  search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
- with MDC(
+ with MDC(correlation_id=trace_id, search_id=search_id):
  self.logger.info("Start calculate metrics")
  if len(args) > 0:
  msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
@@ -937,7 +935,7 @@ class FeaturesEnricher(TransformerMixin):
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- effective_X, effective_y, effective_eval_set
+ effective_X, effective_y, effective_eval_set
  )

  if self.X is None:
@@ -972,11 +970,13 @@ class FeaturesEnricher(TransformerMixin):
  self.__display_support_link(msg)
  return None

+ search_keys = self._get_fit_search_keys_with_original_names()
+
  cat_features_from_backend = self.__get_categorical_features()
  # Convert to original names
  cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
- estimator, validated_X,
+ estimator, validated_X, search_keys
  )
  # Exclude id columns from cat_features
  if self.id_columns and self.id_columns_encoder is not None:
@@ -998,7 +998,6 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")

  prepared_data = self._get_cached_enriched_data(
- trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
@@ -1044,7 +1043,8 @@ class FeaturesEnricher(TransformerMixin):
  with Spinner():
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)

-
+ date_col = self._get_date_column(search_keys)
+ has_date = date_col is not None and date_col in validated_X.columns
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
  cat_features = list(set(client_cat_features + cat_features_from_backend))
  has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
@@ -1250,7 +1250,7 @@ class FeaturesEnricher(TransformerMixin):

  if updating_shaps is not None:
  decoded_X = self._decode_id_columns(fitting_X)
- self._update_shap_values(
+ self._update_shap_values(decoded_X, updating_shaps, silent=not internal_call)

  metrics_df = pd.DataFrame(metrics)
  mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1300,9 +1300,40 @@ class FeaturesEnricher(TransformerMixin):
  finally:
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

+ def _get_trace_id(self):
+ if get_mdc_fields().get("correlation_id") is not None:
+ return get_mdc_fields().get("correlation_id")
+ return int(time.time() * 1000)
+
+ def _get_autodetected_search_keys(self):
+ if self.autodetected_search_keys is None and self._search_task is not None:
+ meta = self._search_task.get_file_metadata(self._get_trace_id())
+ autodetected_search_keys = meta.autodetectedSearchKeys or {}
+ self.autodetected_search_keys = {k: SearchKey[v] for k, v in autodetected_search_keys.items()}
+
+ return self.autodetected_search_keys
+
+ def _add_autodetected_search_keys(self, adding_search_keys: dict[str, SearchKey]):
+ if self.autodetected_search_keys is None:
+ self.autodetected_search_keys = dict()
+ self.autodetected_search_keys.update(adding_search_keys)
+ return self.autodetected_search_keys
+
+ def _get_fit_search_keys_with_original_names(self):
+ if self.fit_search_keys is None and self._search_task is not None:
+ fit_search_keys = dict()
+ meta = self._search_task.get_file_metadata(self._get_trace_id())
+ for column in meta.columns:
+ # TODO check for EMAIL->HEM and multikeys
+ search_key_type = SearchKey.from_meaning_type(column.meaningType)
+ if search_key_type is not None:
+ fit_search_keys[column.originalName] = search_key_type
+ else:
+ fit_search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in self.fit_search_keys.items()}
+ return fit_search_keys
+
  def _select_features_by_psi(
  self,
- trace_id: str,
  X: pd.DataFrame | pd.Series | np.ndarray,
  y: pd.DataFrame | pd.Series | np.ndarray | list,
  eval_set: list[tuple] | tuple | None,
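The new `_get_trace_id` helper reuses a `correlation_id` that is already present in the MDC logging context and only falls back to a millisecond timestamp when none is set. Assuming that behaviour, a caller could group several enricher calls under one id by wrapping them in the `MDC` context manager imported at the top of this file (the id string is arbitrary):

```python
from upgini.mdc import MDC

# Sketch: calls inside the block should pick up this correlation_id via _get_trace_id(),
# so their log records can be filtered together (assumption based on the hunk above).
with MDC(correlation_id="exp-2024-04-acme"):
    enricher.fit(X, y)
    metrics = enricher.calculate_metrics()
```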
@@ -1315,12 +1346,13 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback: Callable | None = None,
  ):
  search_keys = self.search_keys.copy()
-
+ search_keys.update(self._get_autodetected_search_keys())
+ validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set)
  if isinstance(X, np.ndarray):
  search_keys = {str(k): v for k, v in search_keys.items()}

  date_column = self._get_date_column(search_keys)
- has_date = date_column is not None
+ has_date = date_column is not None and date_column in validated_X.columns
  if not has_date:
  self.logger.info("No date column for OOT PSI calculation")
  return
@@ -1350,7 +1382,6 @@ class FeaturesEnricher(TransformerMixin):
  ]

  prepared_data = self._get_cached_enriched_data(
- trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
@@ -1415,13 +1446,11 @@ class FeaturesEnricher(TransformerMixin):
  # Find latest eval set or earliest if all eval sets are before train set
  date_column = self._get_date_column(search_keys)

- date_converter =
+ date_converter = DateTimeConverter(
  date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
  )

-
-
- x_date = X[date_column].dropna()
+ x_date = date_converter.to_date_ms(X).dropna()
  if len(x_date) == 0:
  self.logger.warning("Empty date column in X")
  return []
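The PSI/stability path above now goes through the renamed `DateTimeConverter` and its `to_date_ms` helper instead of reading the raw date column. A minimal sketch of that call pattern, using only the forms visible in this diff (single-argument construction and `to_date_ms` on a DataFrame); combining the two forms is an assumption, and the column name and frame are placeholders:

```python
from upgini.utils.datetime_utils import DateTimeConverter

converter = DateTimeConverter("rep_date")       # single-argument form, as used later in this diff
dates_ms = converter.to_date_ms(df).dropna()    # per-row dates as milliseconds, missing values removed
print(dates_ms.min(), dates_ms.max())
```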
@@ -1434,8 +1463,7 @@ class FeaturesEnricher(TransformerMixin):
  if date_column not in eval_x.columns:
  self.logger.warning(f"Date column not found in eval_set {i + 1}")
  continue
-
- eval_x_date = eval_x[date_column].dropna()
+ eval_x_date = date_converter.to_date_ms(eval_x).dropna()
  if len(eval_x_date) < 1000:
  self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
  continue
@@ -1472,8 +1500,7 @@ class FeaturesEnricher(TransformerMixin):
  )
  checking_eval_set_df = checking_eval_set_df.copy()

- checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
- checking_eval_set_df = date_converter.convert(checking_eval_set_df)
+ checking_eval_set_df[date_column] = date_converter.to_date_ms(eval_set_dates[selected_eval_set_idx].to_frame())

  psi_values_sparse = calculate_sparsity_psi(
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -1503,7 +1530,7 @@ class FeaturesEnricher(TransformerMixin):

  return total_unstable_features

- def _update_shap_values(self,
+ def _update_shap_values(self, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
  self.logger.info(f"Updating SHAP values: {new_shaps}")
  new_shaps = {
@@ -1511,7 +1538,7 @@ class FeaturesEnricher(TransformerMixin):
  for feature, shap in new_shaps.items()
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
  }
- self.__prepare_feature_importances(
+ self.__prepare_feature_importances(df, new_shaps)

  if not silent and self.features_info_display_handle is not None:
  try:
@@ -1638,7 +1665,7 @@ class FeaturesEnricher(TransformerMixin):

  if not isinstance(_cv, BaseCrossValidator):
  date_column = self._get_date_column(search_keys)
- date_series = X[date_column] if date_column is not None else None
+ date_series = X[date_column] if date_column is not None and date_column in X.columns else None
  _cv, groups = CVConfig(
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
  ).get_cv_and_groups(X)
@@ -1691,7 +1718,6 @@ class FeaturesEnricher(TransformerMixin):

  def _get_cached_enriched_data(
  self,
- trace_id: str,
  X: pd.DataFrame | pd.Series | np.ndarray | None = None,
  y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
  eval_set: list[tuple] | tuple | None = None,
@@ -1707,10 +1733,9 @@ class FeaturesEnricher(TransformerMixin):
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  checked_eval_set = self._check_eval_set(eval_set, X)
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)

  sampled_data = self._get_enriched_datasets(
- trace_id=trace_id,
  validated_X=validated_X,
  validated_y=validated_y,
  eval_set=validated_eval_set,
@@ -1737,17 +1762,24 @@ class FeaturesEnricher(TransformerMixin):

  self.logger.info(f"Excluding search keys: {excluding_search_keys}")

+ file_meta = self._search_task.get_file_metadata(self._get_trace_id())
+ fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
+ original_dropped_features = [columns_renaming.get(f, f) for f in fit_dropped_features]
+
  client_features = [
  c
- for c in
+ for c in validated_X.columns.to_list()
  if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
  and c
  not in (
  excluding_search_keys
- +
- + [
+ + original_dropped_features
+ + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
  )
  ]
+ client_features.extend(f for f in generated_features if f in self.feature_names_)
+ if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
+ client_features.append(self.baseline_score_column)
  self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

  selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
@@ -1846,7 +1878,7 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
  enriched_eval_X, eval_y_sampled, self.cv
  )
- if date_column is not None:
+ if date_column is not None and date_column in eval_X_sorted.columns:
  eval_set_dates[idx] = eval_X_sorted[date_column]
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
@@ -1907,7 +1939,6 @@ class FeaturesEnricher(TransformerMixin):

  def _get_enriched_datasets(
  self,
- trace_id: str,
  validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
  validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
  eval_set: list[tuple] | None,
@@ -1935,7 +1966,7 @@ class FeaturesEnricher(TransformerMixin):
  and self.df_with_original_index is not None
  ):
  self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
- return self.__get_enriched_from_fit(
+ return self.__get_enriched_from_fit(validated_X, validated_y, eval_set, remove_outliers_calc_metrics)
  else:
  self.logger.info(
  "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
@@ -1947,7 +1978,6 @@ class FeaturesEnricher(TransformerMixin):
  validated_y,
  eval_set,
  exclude_features_sources,
- trace_id,
  progress_bar,
  progress_callback,
  is_for_metrics=is_for_metrics,
@@ -1995,7 +2025,7 @@ class FeaturesEnricher(TransformerMixin):
  date_column = self._get_date_column(search_keys)
  generated_features = []
  if date_column is not None:
- converter =
+ converter = DateTimeConverter(
  date_column,
  self.date_format,
  self.logger,
@@ -2004,6 +2034,7 @@ class FeaturesEnricher(TransformerMixin):
  )
  # Leave original date column values
  df_with_date_features = converter.convert(df, keep_time=True)
+ # TODO check if this is correct
  df_with_date_features[date_column] = df[date_column]
  df = df_with_date_features
  generated_features = converter.generated_features
@@ -2035,8 +2066,8 @@ class FeaturesEnricher(TransformerMixin):
  # Sample after sorting by system_record_id for idempotency
  df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)

- if
- df = df.drop(columns=
+ if DateTimeConverter.DATETIME_COL in df.columns:
+ df = df.drop(columns=DateTimeConverter.DATETIME_COL)

  df = df.rename(columns=columns_renaming)
  generated_features = [columns_renaming.get(c, c) for c in generated_features]
@@ -2072,22 +2103,24 @@ class FeaturesEnricher(TransformerMixin):

  def __get_enriched_from_fit(
  self,
+ validated_X: pd.DataFrame,
+ validated_y: pd.Series,
  eval_set: list[tuple] | None,
- trace_id: str,
  remove_outliers_calc_metrics: bool | None,
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
  search_keys = self.fit_search_keys.copy()

  rows_to_drop = None
-
+ date_column = self._get_date_column(search_keys)
+ has_date = date_column is not None and date_column in validated_X.columns
  self.model_task_type = self.model_task_type or define_task(
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
  )
  if remove_outliers_calc_metrics is None:
  remove_outliers_calc_metrics = True
  if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
- target_outliers_df = self._search_task.get_target_outliers(
+ target_outliers_df = self._search_task.get_target_outliers(self._get_trace_id())
  if target_outliers_df is not None and len(target_outliers_df) > 0:
  outliers = pd.merge(
  self.df_with_original_index,
@@ -2104,7 +2137,7 @@ class FeaturesEnricher(TransformerMixin):

  # index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
  # can differs from it
- fit_features = self._search_task.get_all_initial_raw_features(
+ fit_features = self._search_task.get_all_initial_raw_features(self._get_trace_id(), metrics_calculation=True)

  # Pre-process features if we need to drop outliers
  if rows_to_drop is not None:
@@ -2122,6 +2155,24 @@ class FeaturesEnricher(TransformerMixin):
  drop_system_record_id=False,
  )

+ enriched_Xy.rename(columns=self.fit_columns_renaming, inplace=True)
+ search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+ generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
+
+ validated_Xy = validated_X.copy()
+ validated_Xy[TARGET] = validated_y
+
+ selecting_columns = self._selecting_input_and_generated_columns(
+ validated_Xy, self.fit_generated_features, keep_input=True
+ )
+ selecting_columns.extend(
+ c
+ for c in enriched_Xy.columns
+ if (c in self.feature_names_ and c not in selecting_columns and c not in validated_X.columns)
+ or c in [EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SYSTEM_RECORD_ID]
+ )
+ enriched_Xy = enriched_Xy[selecting_columns]
+
  # Handle eval sets extraction based on EVAL_SET_INDEX
  if EVAL_SET_INDEX in enriched_Xy.columns:
  eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
@@ -2133,7 +2184,11 @@ class FeaturesEnricher(TransformerMixin):
  ].copy()
  enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()

- x_columns = [
+ x_columns = [
+ c
+ for c in [self.fit_columns_renaming.get(k, k) for k in self.df_with_original_index.columns]
+ if c not in [EVAL_SET_INDEX, TARGET] and c in selecting_columns
+ ]
  X_sampled = enriched_Xy[x_columns].copy()
  y_sampled = enriched_Xy[TARGET].copy()
  enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -2155,15 +2210,6 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

- # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
- X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
- eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
- search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
- generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
-
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
  return self.__cache_and_return_results(
  datasets_hash,
@@ -2182,7 +2228,6 @@ class FeaturesEnricher(TransformerMixin):
  validated_y: pd.Series,
  eval_set: list[tuple] | None,
  exclude_features_sources: list[str] | None,
- trace_id: str,
  progress_bar: ProgressBar | None,
  progress_callback: Callable[[SearchProgress], Any] | None,
  is_for_metrics: bool = False,
@@ -2208,7 +2253,6 @@ class FeaturesEnricher(TransformerMixin):

  # Transform
  enriched_df, columns_renaming, generated_features, search_keys = self.__inner_transform(
- trace_id,
  X=df.drop(columns=[TARGET]),
  y=df[TARGET],
  exclude_features_sources=exclude_features_sources,
@@ -2385,11 +2429,10 @@ class FeaturesEnricher(TransformerMixin):

  return self.features_info

- def get_progress(self,
+ def get_progress(self, search_task: SearchTask | None = None) -> SearchProgress:
  search_task = search_task or self._search_task
  if search_task is not None:
-
- return search_task.get_progress(trace_id)
+ return search_task.get_progress(self._get_trace_id())

  def display_transactional_transform_api(self, only_online_sources=False):
  if self.api_key is None:
@@ -2416,7 +2459,7 @@ class FeaturesEnricher(TransformerMixin):
  return "12345678"
  return "test_value"

- file_metadata = self._search_task.get_file_metadata(
+ file_metadata = self._search_task.get_file_metadata(time.time_ns())

  def get_column_meta(column_name: str) -> FileColumnMetadata:
  for c in file_metadata.columns:
@@ -2490,7 +2533,6 @@ if response.status_code == 200:

  def __inner_transform(
  self,
- trace_id: str,
  X: pd.DataFrame,
  *,
  y: pd.Series | None = None,
@@ -2509,174 +2551,133 @@ if response.status_code == 200:
|
|
|
2509
2551
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
2510
2552
|
|
|
2511
2553
|
start_time = time.time()
|
|
2512
|
-
|
|
2513
|
-
with MDC(trace_id=trace_id, search_id=search_id):
|
|
2514
|
-
self.logger.info("Start transform")
|
|
2554
|
+
self.logger.info("Start transform")
|
|
2515
2555
|
|
|
2516
|
-
|
|
2517
|
-
X, y, eval_set=None, is_transform=True, silent=True
|
|
2518
|
-
)
|
|
2519
|
-
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
|
2556
|
+
search_keys = self.search_keys.copy()
|
|
2520
2557
|
|
|
2521
|
-
|
|
2558
|
+
self.__validate_search_keys(search_keys, self.search_id)
|
|
2522
2559
|
|
|
2523
|
-
|
|
2560
|
+
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set=None, is_transform=True)
|
|
2561
|
+
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
|
2524
2562
|
|
|
2525
|
-
|
|
2526
|
-
if len(self.feature_names_) == 0:
|
|
2527
|
-
msg = self.bundle.get("no_important_features_for_transform")
|
|
2528
|
-
self.__log_warning(msg, show_support_link=True)
|
|
2529
|
-
return None, {}, [], self.search_keys
|
|
2563
|
+
validated_Xy = df.copy()
|
|
2530
2564
|
|
|
2531
|
-
|
|
2565
|
+
self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
|
|
2532
2566
|
|
|
2533
|
-
|
|
2534
|
-
|
|
2535
|
-
|
|
2536
|
-
|
|
2537
|
-
|
|
2567
|
+
# If there are no important features, return original dataframe
|
|
2568
|
+
if len(self.feature_names_) == 0:
|
|
2569
|
+
msg = self.bundle.get("no_important_features_for_transform")
|
|
2570
|
+
self.__log_warning(msg, show_support_link=True)
|
|
2571
|
+
return None, {}, [], search_keys
|
|
2538
2572
|
|
|
2539
|
-
|
|
2540
|
-
|
|
2541
|
-
|
|
2542
|
-
|
|
2543
|
-
|
|
2544
|
-
msg = self.bundle.get("online_api_features_transform").format(online_api_features)
|
|
2545
|
-
self.logger.warning(msg)
|
|
2546
|
-
print(msg)
|
|
2547
|
-
self.display_transactional_transform_api(only_online_sources=True)
|
|
2548
|
-
|
|
2549
|
-
if not metrics_calculation:
|
|
2550
|
-
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
2551
|
-
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
2552
|
-
if transform_usage.has_limit:
|
|
2553
|
-
if len(X) > transform_usage.rest_rows:
|
|
2554
|
-
rest_rows = max(transform_usage.rest_rows, 0)
|
|
2555
|
-
msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
|
|
2556
|
-
self.logger.warning(msg)
|
|
2557
|
-
print(msg)
|
|
2558
|
-
show_request_quote_button()
|
|
2559
|
-
return None, {}, [], {}
|
|
2560
|
-
else:
|
|
2561
|
-
msg = self.bundle.get("transform_usage_info").format(
|
|
2562
|
-
transform_usage.limit, transform_usage.transformed_rows
|
|
2563
|
-
)
|
|
2564
|
-
self.logger.info(msg)
|
|
2565
|
-
print(msg)
|
|
2573
|
+
if self._has_paid_features(exclude_features_sources):
|
|
2574
|
+
msg = self.bundle.get("transform_with_paid_features")
|
|
2575
|
+
self.logger.warning(msg)
|
|
2576
|
+
self.__display_support_link(msg)
|
|
2577
|
+
return None, {}, [], search_keys
|
|
2566
2578
|
|
|
2567
|
-
|
|
2579
|
+
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
|
2580
|
+
if len(online_api_features) > 0:
|
|
2581
|
+
self.logger.warning(
|
|
2582
|
+
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2583
|
+
)
|
|
2584
|
+
msg = self.bundle.get("online_api_features_transform").format(online_api_features)
|
|
2585
|
+
self.logger.warning(msg)
|
|
2586
|
+
print(msg)
|
|
2587
|
+
self.display_transactional_transform_api(only_online_sources=True)
|
|
2588
|
+
|
|
2589
|
+
if not metrics_calculation:
|
|
2590
|
+
transform_usage = self.rest_client.get_current_transform_usage(self._get_trace_id())
|
|
2591
|
+
self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
|
|
2592
|
+
if transform_usage.has_limit:
|
|
2593
|
+
if len(X) > transform_usage.rest_rows:
|
|
2594
|
+
rest_rows = max(transform_usage.rest_rows, 0)
|
|
2595
|
+
bundle_msg = (
|
|
2596
|
+
"transform_usage_warning_registered" if self.__is_registered else "transform_usage_warning_demo"
|
|
2597
|
+
)
|
|
2598
|
+
msg = self.bundle.get(bundle_msg).format(rest_rows, len(X))
|
|
2599
|
+
self.logger.warning(msg)
|
|
2600
|
+
print(msg)
|
|
2601
|
+
show_request_quote_button(is_registered=self.__is_registered)
|
|
2602
|
+
return None, {}, [], {}
|
|
2603
|
+
else:
|
|
2604
|
+
msg = self.bundle.get("transform_usage_info").format(
|
|
2605
|
+
transform_usage.limit, transform_usage.transformed_rows
|
|
2606
|
+
)
|
|
2607
|
+
self.logger.info(msg)
|
|
2608
|
+
print(msg)
|
|
2568
2609
|
|
|
2569
|
-
|
|
2570
|
-
c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
|
|
2571
|
-
]
|
|
2572
|
-
if len(columns_to_drop) > 0:
|
|
2573
|
-
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2574
|
-
self.logger.warning(msg)
|
|
2575
|
-
print(msg)
|
|
2576
|
-
df = df.drop(columns=columns_to_drop)
|
|
2610
|
+
is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
|
|
2577
2611
|
|
|
2578
|
-
|
|
2579
|
-
|
|
2580
|
-
|
|
2581
|
-
|
|
2582
|
-
|
|
2612
|
+
columns_to_drop = [
|
|
2613
|
+
c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
|
|
2614
|
+
]
|
|
2615
|
+
if len(columns_to_drop) > 0:
|
|
2616
|
+
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2617
|
+
self.logger.warning(msg)
|
|
2618
|
+
print(msg)
|
|
2619
|
+
df = df.drop(columns=columns_to_drop)
|
|
2583
2620
|
|
|
2584
|
-
|
|
2585
|
-
|
|
2586
|
-
)
|
|
2621
|
+
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2622
|
+
search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in search_keys})
|
|
2587
2623
|
|
|
2588
|
-
|
|
2624
|
+
search_keys = self.__prepare_search_keys(
|
|
2625
|
+
df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
|
|
2626
|
+
)
|
|
2589
2627
|
|
|
2590
|
-
|
|
2591
|
-
msg = self.bundle.get("unsupported_index_column")
|
|
2592
|
-
self.logger.info(msg)
|
|
2593
|
-
print(msg)
|
|
2594
|
-
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
2595
|
-
validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
2628
|
+
df = self.__handle_index_search_keys(df, search_keys)
|
|
2596
2629
|
|
|
2597
|
-
|
|
2630
|
+
if DEFAULT_INDEX in df.columns:
|
|
2631
|
+
msg = self.bundle.get("unsupported_index_column")
|
|
2632
|
+
self.logger.info(msg)
|
|
2633
|
+
print(msg)
|
|
2634
|
+
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
2635
|
+
validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
2598
2636
|
|
|
2599
|
-
|
|
2600
|
-
date_column = self._get_date_column(search_keys)
|
|
2601
|
-
if date_column is not None:
|
|
2602
|
-
converter = DateTimeSearchKeyConverter(
|
|
2603
|
-
date_column,
|
|
2604
|
-
self.date_format,
|
|
2605
|
-
self.logger,
|
|
2606
|
-
bundle=self.bundle,
|
|
2607
|
-
generate_cyclical_features=self.generate_search_key_features,
|
|
2608
|
-
)
|
|
2609
|
-
df = converter.convert(df, keep_time=True)
|
|
2610
|
-
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2611
|
-
generated_features.extend(converter.generated_features)
|
|
2612
|
-
else:
|
|
2613
|
-
self.logger.info("Input dataset hasn't date column")
|
|
2614
|
-
if self.__should_add_date_column():
|
|
2615
|
-
df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
|
|
2616
|
-
|
|
2617
|
-
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2618
|
-
if email_columns and self.generate_search_key_features:
|
|
2619
|
-
generator = EmailDomainGenerator(email_columns)
|
|
2620
|
-
df = generator.generate(df)
|
|
2621
|
-
generated_features.extend(generator.generated_features)
|
|
2622
|
-
|
|
2623
|
-
normalizer = Normalizer(self.bundle, self.logger)
|
|
2624
|
-
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2625
|
-
columns_renaming = normalizer.columns_renaming
|
|
2626
|
-
|
|
2627
|
-
# If there are no external features, we don't call backend on transform
|
|
2628
|
-
external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
|
|
2629
|
-
if len(external_features) == 0:
|
|
2630
|
-
self.logger.warning(
|
|
2631
|
-
"No external features found, returning original dataframe"
|
|
2632
|
-
f" with generated important features: {self.feature_names_}"
|
|
2633
|
-
)
|
|
2634
|
-
df = df.rename(columns=columns_renaming)
|
|
2635
|
-
generated_features = [columns_renaming.get(c, c) for c in generated_features]
|
|
2636
|
-
search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
|
|
2637
|
-
selecting_columns = self._selecting_input_and_generated_columns(
|
|
2638
|
-
validated_Xy, generated_features, keep_input, trace_id
|
|
2639
|
-
)
|
|
2640
|
-
self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
|
|
2641
|
-
if add_fit_system_record_id:
|
|
2642
|
-
df = self._add_fit_system_record_id(
|
|
2643
|
-
df,
|
|
2644
|
-
search_keys,
|
|
2645
|
-
SYSTEM_RECORD_ID,
|
|
2646
|
-
TARGET,
|
|
2647
|
-
columns_renaming,
|
|
2648
|
-
self.id_columns,
|
|
2649
|
-
self.cv,
|
|
2650
|
-
self.model_task_type,
|
|
2651
|
-
self.logger,
|
|
2652
|
-
self.bundle,
|
|
2653
|
-
)
|
|
2654
|
-
selecting_columns.append(SYSTEM_RECORD_ID)
|
|
2655
|
-
return df[selecting_columns], columns_renaming, generated_features, search_keys
|
|
2656
|
-
|
|
2657
|
-
# Don't pass all features in backend on transform
|
|
2658
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
2659
|
-
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2660
|
-
if len(features_for_transform) > 0:
|
|
2661
|
-
missing_features_for_transform = [
|
|
2662
|
-
columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
|
|
2663
|
-
]
|
|
2664
|
-
if TARGET in missing_features_for_transform:
|
|
2665
|
-
raise ValidationError(self.bundle.get("missing_target_for_transform"))
|
|
2637
|
+
df = self.__add_country_code(df, search_keys)
|
|
2666
2638
|
|
|
2667
|
-
|
|
2668
|
-
|
|
2669
|
-
|
|
2670
|
-
|
|
2671
|
-
|
|
2639
|
+
generated_features = []
|
|
2640
|
+
date_column = self._get_date_column(search_keys)
|
|
2641
|
+
if date_column is not None:
|
|
2642
|
+
converter = DateTimeConverter(
|
|
2643
|
+
date_column,
|
|
2644
|
+
self.date_format,
|
|
2645
|
+
self.logger,
|
|
2646
|
+
bundle=self.bundle,
|
|
2647
|
+
generate_cyclical_features=self.generate_search_key_features,
|
|
2648
|
+
)
|
|
2649
|
+
df = converter.convert(df, keep_time=True)
|
|
2650
|
+
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2651
|
+
generated_features.extend(converter.generated_features)
|
|
2652
|
+
else:
|
|
2653
|
+
self.logger.info("Input dataset hasn't date column")
|
|
2654
|
+
if self.__should_add_date_column():
|
|
2655
|
+
df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
|
|
2672
2656
|
|
|
2673
|
-
|
|
2657
|
+
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
2658
|
+
if email_columns and self.generate_search_key_features:
|
|
2659
|
+
generator = EmailDomainGenerator(email_columns)
|
|
2660
|
+
df = generator.generate(df)
|
|
2661
|
+
generated_features.extend(generator.generated_features)
|
|
2674
2662
|
|
|
2675
|
-
|
|
2676
|
-
|
|
2677
|
-
|
|
2663
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
2664
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2665
|
+
columns_renaming = normalizer.columns_renaming
|
|
2678
2666
|
|
|
2679
|
-
|
|
2667
|
+
# If there are no external features, we don't call backend on transform
|
|
2668
|
+
external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
|
|
2669
|
+
if len(external_features) == 0:
|
|
2670
|
+
self.logger.warning(
|
|
2671
|
+
"No external features found, returning original dataframe"
|
|
2672
|
+
f" with generated important features: {self.feature_names_}"
|
|
2673
|
+
)
|
|
2674
|
+
df = df.rename(columns=columns_renaming)
|
|
2675
|
+
generated_features = [columns_renaming.get(c, c) for c in generated_features]
|
|
2676
|
+
search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
|
|
2677
|
+
selecting_columns = self._selecting_input_and_generated_columns(
|
|
2678
|
+
validated_Xy, generated_features, keep_input, is_transform=True
|
|
2679
|
+
)
|
|
2680
|
+
self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
|
|
2680
2681
|
if add_fit_system_record_id:
|
|
2681
2682
|
df = self._add_fit_system_record_id(
|
|
2682
2683
|
df,
|
|
@@ -2690,86 +2691,144 @@ if response.status_code == 200:
|
|
|
2690
2691
|
self.logger,
|
|
2691
2692
|
self.bundle,
|
|
2692
2693
|
)
|
|
2693
|
-
|
|
2694
|
-
|
|
2694
|
+
selecting_columns.append(SYSTEM_RECORD_ID)
|
|
2695
|
+
return df[selecting_columns], columns_renaming, generated_features, search_keys
|
|
2695
2696
|
|
|
2696
|
-
|
|
2697
|
-
|
|
2698
|
-
|
|
2697
|
+
# Don't pass all features in backend on transform
|
|
2698
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
2699
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
2700
|
+
if features_for_transform:
|
|
2701
|
+
missing_features_for_transform = [
|
|
2702
|
+
columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
|
|
2703
|
+
]
|
|
2704
|
+
if TARGET in missing_features_for_transform:
|
|
2705
|
+
raise ValidationError(self.bundle.get("missing_target_for_transform"))
|
|
2699
2706
|
|
|
2700
|
-
|
|
2707
|
+
if len(missing_features_for_transform) > 0:
|
|
2708
|
+
raise ValidationError(
|
|
2709
|
+
self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
|
|
2710
|
+
)
|
|
2711
|
+
features_for_embeddings = self._search_task.get_features_for_embeddings()
|
|
2712
|
+
if features_for_embeddings:
|
|
2713
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_embeddings)
|
|
2714
|
+
features_for_transform = [f for f in features_for_transform if f not in search_keys.keys()]
|
|
2701
2715
|
|
|
2702
|
-
|
|
2703
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
|
2716
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2704
2717
|
|
|
2705
|
-
|
|
2718
|
+
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
2719
|
+
"float64"
|
|
2720
|
+
)
|
|
2706
2721
|
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
|
|
2711
|
-
|
|
2712
|
-
|
|
2713
|
-
|
|
2714
|
-
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
|
|
2718
|
-
|
|
2722
|
+
features_not_to_pass = []
|
|
2723
|
+
if add_fit_system_record_id:
|
|
2724
|
+
df = self._add_fit_system_record_id(
|
|
2725
|
+
df,
|
|
2726
|
+
search_keys,
|
|
2727
|
+
SYSTEM_RECORD_ID,
|
|
2728
|
+
TARGET,
|
|
2729
|
+
columns_renaming,
|
|
2730
|
+
self.id_columns,
|
|
2731
|
+
self.cv,
|
|
2732
|
+
self.model_task_type,
|
|
2733
|
+
self.logger,
|
|
2734
|
+
self.bundle,
|
|
2735
|
+
)
|
|
2736
|
+
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2737
|
+
features_not_to_pass.append(SORT_ID)
|
|
2719
2738
|
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
ip_column,
|
|
2724
|
-
search_keys,
|
|
2725
|
-
columns_renaming,
|
|
2726
|
-
list(unnest_search_keys.keys()),
|
|
2727
|
-
self.bundle,
|
|
2728
|
-
self.logger,
|
|
2729
|
-
)
|
|
2730
|
-
df = converter.convert(df)
|
|
2739
|
+
system_columns_with_original_index = [ENTITY_SYSTEM_RECORD_ID] + generated_features
|
|
2740
|
+
if add_fit_system_record_id:
|
|
2741
|
+
system_columns_with_original_index.append(SORT_ID)
|
|
2731
2742
|
|
|
2732
|
-
|
|
2733
|
-
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
|
2734
|
-
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
|
2743
|
+
df_before_explode = df[system_columns_with_original_index].copy()
|
|
2735
2744
|
-
-
-                c
-                for c in df.columns
-                if c not in search_keys.keys()
-                and c not in features_for_transform
-                and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
-            ]
-        )
+        # Explode multiple search keys
+        df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)

-
-        df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+        # Convert search keys and generate features on them

-
-
-
-
+        email_column = self._get_email_column(search_keys)
+        hem_column = self._get_hem_column(search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                search_keys,
+                columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.logger,
            )
-
-        meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
-        if SEARCH_KEY_UNNEST in df.columns:
-            meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
+            df = converter.convert(df)

-
+        ip_column = self._get_ip_column(search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                search_keys,
+                columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)

-
+        date_features = []
+        for col in features_for_transform:
+            if DateTimeConverter(col).is_datetime(df):
+                df[col] = DateTimeConverter(col).to_date_string(df)
+                date_features.append(col)
+
+        meaning_types = {}
+        meaning_types.update(
+            {
+                col: FileColumnMeaningType.FEATURE
+                for col in features_for_transform
+                if col not in date_features and col not in generated_features
+            }
+        )
+        meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in generated_features})
+        meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
+        meaning_types.update({col: key.value for col, key in search_keys.items()})

-
+        features_not_to_pass.extend(
+            [
+                c
+                for c in df.columns
+                if c not in search_keys.keys()
+                and c not in features_for_transform
+                and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
+            ]
+        )

-
-
-        )
-        if not silent_mode and full_duplicates_warning:
-            self.__log_warning(full_duplicates_warning)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            df = df.drop(columns=DateTimeConverter.DATETIME_COL)

-
-
+        # search keys might be changed after explode
+        columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
+        df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
+            "float64"
+        )
+        meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
+        meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
+        if SEARCH_KEY_UNNEST in df.columns:
+            meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
+
+        df = df.reset_index(drop=True)
+
+        combined_search_keys = combine_search_keys(search_keys.keys())
+
+        df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
+
+        df_without_features, full_duplicates_warning = clean_full_duplicates(
+            df_without_features, is_transform=True, logger=self.logger, bundle=self.bundle
+        )
+        if not silent_mode and full_duplicates_warning:
+            self.__log_warning(full_duplicates_warning)
+
+        del df
+        gc.collect()
+
+        def invoke_validation(df: pd.DataFrame):

            dataset = Dataset(
                "sample_" + str(uuid.uuid4()),
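
Note (illustrative, not part of the package diff): the transform path above derives SYSTEM_RECORD_ID by hashing the search-key and feature columns of each row. A minimal sketch of that idea with pandas, using made-up column names:

```python
import pandas as pd

df = pd.DataFrame({"email": ["a@x.com", "b@y.com"], "date": ["2024-01-01", "2024-01-02"]})
key_columns = sorted(["date", "email"])  # sort so the hash does not depend on column order
# Stable per-row hash over the selected columns, independent of the index
df["system_record_id"] = pd.util.hash_pandas_object(df[key_columns], index=False).astype("float64")
print(df)
```
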
@@ -2789,7 +2848,7 @@ if response.status_code == 200:
            dataset.columns_renaming = columns_renaming

            validation_task = self._search_task.validation(
-
+                self._get_trace_id(),
                dataset,
                start_time=start_time,
                extract_features=True,
@@ -2801,7 +2860,7 @@ if response.status_code == 200:
                progress_callback=progress_callback,
            )

-            del
+            del df, dataset
            gc.collect()

            if not silent_mode:
@@ -2809,7 +2868,7 @@ if response.status_code == 200:
                if not self.__is_registered:
                    print(self.bundle.get("polling_unregister_information"))

-                progress = self.get_progress(
+                progress = self.get_progress(validation_task)
                progress.recalculate_eta(time.time() - start_time)
                if progress_bar is not None:
                    progress_bar.progress = progress.to_progress_bar()
@@ -2831,15 +2890,15 @@ if response.status_code == 200:
                        if progress.stage == ProgressStage.FAILED.value:
                            raise Exception(progress.error_message)
                        time.sleep(polling_period_seconds)
-                        progress = self.get_progress(
+                        progress = self.get_progress(validation_task)
                except KeyboardInterrupt as e:
                    print(self.bundle.get("search_stopping"))
-                    self.rest_client.stop_search_task_v2(
+                    self.rest_client.stop_search_task_v2(self._get_trace_id(), validation_task.search_task_id)
                    self.logger.warning(f"Search {validation_task.search_task_id} stopped by user")
                    print(self.bundle.get("search_stopped"))
                    raise e

-            validation_task.poll_result(
+            validation_task.poll_result(self._get_trace_id(), quiet=True)

            seconds_left = time.time() - start_time
            progress = SearchProgress(97.0, ProgressStage.DOWNLOADING, seconds_left)
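
Note (illustrative, not part of the package diff): the hunks above replace an explicit `trace_id` argument with `self._get_trace_id()`. A minimal sketch of what such a lazily created, reused trace id could look like; the class and attribute names here are assumptions, not the library's actual implementation:

```python
import uuid


class TraceIdMixin:
    """Sketch: create one correlation id per enricher instance and reuse it."""

    def __init__(self) -> None:
        self._trace_id: str | None = None

    def _get_trace_id(self) -> str:
        # Lazily generate the id so call sites no longer pass trace_id around
        if self._trace_id is None:
            self._trace_id = str(uuid.uuid4())
        return self._trace_id
```
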
@@ -2851,96 +2910,118 @@ if response.status_code == 200:
        if not silent_mode:
            print(self.bundle.get("transform_start"))

-
-        df_before_explode = df_before_explode.rename(columns=columns_renaming)
-        generated_features = [columns_renaming.get(c, c) for c in generated_features]
-        combined_df = pd.concat(
-            [
-                validated_Xy.reset_index(drop=True),
-                df_before_explode.reset_index(drop=True),
-            ],
-            axis=1,
-        ).set_index(validated_Xy.index)
-
-        result_features = validation_task.get_all_validation_raw_features(trace_id, metrics_calculation)
-
-        result = self.__enrich(
-            combined_df,
-            result_features,
-            how="left",
-        )
+            return validation_task.get_all_validation_raw_features(self._get_trace_id(), metrics_calculation)

-
-
-
-
-
-                for c in result.columns
-                if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
+        if len(df_without_features) <= Dataset.MAX_ROWS:
+            result_features = invoke_validation(df_without_features)
+        else:
+            self.logger.warning(
+                f"Dataset has more than {Dataset.MAX_ROWS} rows, splitting into chunks of {Dataset.MAX_ROWS} rows"
            )
-
-            selecting_columns.append(SORT_ID)
+            result_features_list = []

-
-
-
-
-                if c in selecting_columns and c not in sorted_selecting_columns:
-                    sorted_selecting_columns.append(c)
-        for c in result.columns:
-            if c in selecting_columns and c not in sorted_selecting_columns:
-                sorted_selecting_columns.append(c)
+            for i in range(0, len(df_without_features), Dataset.MAX_ROWS):
+                chunk = df_without_features.iloc[i:i+Dataset.MAX_ROWS]
+                result_features_list.append(invoke_validation(chunk))
+            result_features = pd.concat(result_features_list)

-
+        # Prepare input DataFrame for __enrich by concatenating generated ids and client features
+        df_before_explode = df_before_explode.rename(columns=columns_renaming)
+        generated_features = [columns_renaming.get(c, c) for c in generated_features]
+        combined_df = pd.concat(
+            [
+                validated_Xy.reset_index(drop=True),
+                df_before_explode.reset_index(drop=True),
+            ],
+            axis=1,
+        ).set_index(validated_Xy.index)
+
+        result = self.__enrich(
+            combined_df,
+            result_features,
+            how="left",
+        )

-
+        selecting_columns = self._selecting_input_and_generated_columns(
+            validated_Xy, generated_features, keep_input, is_transform=True
+        )
+        selecting_columns.extend(
+            c
+            for c in result.columns
+            if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
+        )
+        if add_fit_system_record_id:
+            selecting_columns.append(SORT_ID)

-
-
+        selecting_columns = list(set(selecting_columns))
+        # sorting: first columns from X, then generated features, then enriched features
+        sorted_selecting_columns = [c for c in validated_Xy.columns if c in selecting_columns]
+        for c in generated_features:
+            if c in selecting_columns and c not in sorted_selecting_columns:
+                sorted_selecting_columns.append(c)
+        for c in result.columns:
+            if c in selecting_columns and c not in sorted_selecting_columns:
+                sorted_selecting_columns.append(c)

-
-        result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
+        self.logger.info(f"Transform sorted_selecting_columns: {sorted_selecting_columns}")

-
+        result = result[sorted_selecting_columns]
+
+        if self.country_added:
+            result = result.drop(columns=COUNTRY, errors="ignore")
+
+        if add_fit_system_record_id:
+            result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})
+
+        return result, columns_renaming, generated_features, search_keys

    def _selecting_input_and_generated_columns(
        self,
        validated_Xy: pd.DataFrame,
        generated_features: list[str],
        keep_input: bool,
-
+        is_transform: bool = False,
    ):
-
-
-
-
-
+        file_meta = self._search_task.get_file_metadata(self._get_trace_id())
+        fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
+        fit_input_columns = [c.originalName for c in file_meta.columns]
+        original_dropped_features = [self.fit_columns_renaming.get(c, c) for c in fit_dropped_features]
+        new_columns_on_transform = [
+            c for c in validated_Xy.columns if c not in fit_input_columns and c not in original_dropped_features
        ]
+        fit_original_search_keys = self._get_fit_search_keys_with_original_names()
+
+        selected_generated_features = [c for c in generated_features if c in self.feature_names_]
        if keep_input is True:
            selected_input_columns = [
                c
                for c in validated_Xy.columns
                if not self.fit_select_features
                or c in self.feature_names_
-                or c in new_columns_on_transform
-                or c in
+                or (c in new_columns_on_transform and is_transform)
+                or c in fit_original_search_keys
                or c in (self.id_columns or [])
                or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+                or c == self.baseline_score_column
            ]
        else:
            selected_input_columns = []

+        if DEFAULT_INDEX in selected_input_columns:
+            selected_input_columns.remove(DEFAULT_INDEX)
+
        return selected_input_columns + selected_generated_features

-    def
+    def _validate_empty_search_keys(self, search_keys: dict[str, SearchKey], is_transform: bool = False):
        if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
-            if
-                self.logger.debug(
-                return
+            if is_transform:
+                self.logger.debug("Transform started without search_keys")
+                # return
            else:
                self.logger.warning("search_keys not provided")
-                raise ValidationError(self.bundle.get("empty_search_keys"))
+                # raise ValidationError(self.bundle.get("empty_search_keys"))

+    def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
        key_types = search_keys.values()

        # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
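
Note (illustrative, not part of the package diff): transform now splits oversized inputs into chunks of `Dataset.MAX_ROWS` rows and concatenates the per-chunk validation results. A self-contained sketch of that pattern; `MAX_ROWS` and `invoke_validation` are stand-ins:

```python
import pandas as pd

MAX_ROWS = 100  # stand-in for Dataset.MAX_ROWS


def invoke_validation(chunk: pd.DataFrame) -> pd.DataFrame:
    # placeholder for the per-chunk server round trip
    return chunk.assign(enriched_feature=0.0)


df = pd.DataFrame({"x": range(250)})
if len(df) <= MAX_ROWS:
    result_features = invoke_validation(df)
else:
    parts = [invoke_validation(df.iloc[i:i + MAX_ROWS]) for i in range(0, len(df), MAX_ROWS)]
    result_features = pd.concat(parts)
```
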
@@ -3004,7 +3085,6 @@ if response.status_code == 200:

    def __inner_fit(
        self,
-        trace_id: str,
        X: pd.DataFrame | pd.Series | np.ndarray,
        y: pd.DataFrame | pd.Series | np.ndarray | list | None,
        eval_set: list[tuple] | None,
@@ -3086,8 +3166,10 @@ if response.status_code == 200:
        df = self.__handle_index_search_keys(df, self.fit_search_keys)
        self.fit_search_keys = self.__prepare_search_keys(df, self.fit_search_keys, is_demo_dataset)

+        df = self._validate_OOT(df, self.fit_search_keys)
+
        maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
-        has_date = maybe_date_column is not None
+        has_date = maybe_date_column is not None and maybe_date_column in validated_X.columns

        self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)

@@ -3114,7 +3196,7 @@ if response.status_code == 200:

        if DEFAULT_INDEX in df.columns:
            msg = self.bundle.get("unsupported_index_column")
-            self.logger.
+            self.logger.warning(msg)
            print(msg)
            self.fit_dropped_features.add(DEFAULT_INDEX)
            df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -3124,7 +3206,7 @@ if response.status_code == 200:
        self.fit_generated_features = []

        if has_date:
-            converter =
+            converter = DateTimeConverter(
                maybe_date_column,
                self.date_format,
                self.logger,
@@ -3170,15 +3252,19 @@ if response.status_code == 200:
            df, self.fit_search_keys, self.fit_generated_features
        )
        self.fit_columns_renaming = normalizer.columns_renaming
-        if normalizer.
-            self.
+        if normalizer.removed_datetime_features:
+            self.fit_dropped_features.update(normalizer.removed_datetime_features)
+            original_removed_datetime_features = [
+                self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
+            ]
+            self.__log_warning(self.bundle.get("dataset_date_features").format(original_removed_datetime_features))

        non_feature_columns = [
            self.TARGET_NAME,
            EVAL_SET_INDEX,
        ] + list(self.fit_search_keys.keys())
-        if
-            non_feature_columns.append(
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            non_feature_columns.append(DateTimeConverter.DATETIME_COL)

        features_columns = [c for c in df.columns if c not in non_feature_columns]

@@ -3220,7 +3306,7 @@ if response.status_code == 200:
        if fintech_warnings:
            for fintech_warning in fintech_warnings:
                self.__log_warning(fintech_warning)
-        df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
+        df, full_duplicates_warning = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
        if full_duplicates_warning:
            if len(df) == 0:
                raise ValidationError(full_duplicates_warning)
@@ -3265,15 +3351,28 @@ if response.status_code == 200:
            ENTITY_SYSTEM_RECORD_ID,
            SEARCH_KEY_UNNEST,
        ] + list(self.fit_search_keys.keys())
-        if
-            non_feature_columns.append(
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            non_feature_columns.append(DateTimeConverter.DATETIME_COL)

        features_columns = [c for c in df.columns if c not in non_feature_columns]

+        # find date features
+        date_features = []
+        for col in features_columns:
+            if DateTimeConverter(col).is_datetime(df):
+                df[col] = DateTimeConverter(col).to_date_string(df)
+                date_features.append(col)
+
        meaning_types = {
            **{col: key.value for col, key in self.fit_search_keys.items()},
-            **{
+            **{
+                str(c): FileColumnMeaningType.FEATURE
+                for c in df.columns
+                if c not in non_feature_columns and c not in date_features and c not in self.fit_generated_features
+            },
        }
+        meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in self.fit_generated_features})
+        meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
        meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
        meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
        if SEARCH_KEY_UNNEST in df.columns:
@@ -3294,8 +3393,8 @@ if response.status_code == 200:
            self.bundle,
        )

-        if
-            df = df.drop(columns=
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            df = df.drop(columns=DateTimeConverter.DATETIME_COL)

        meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID

@@ -3320,6 +3419,8 @@ if response.status_code == 200:
            cv_type=self.cv,
            id_columns=self.__get_renamed_id_columns(),
            is_imbalanced=self.imbalanced,
+            dropped_columns=[self.fit_columns_renaming.get(f, f) for f in self.fit_dropped_features],
+            autodetected_search_keys=self.autodetected_search_keys,
            date_column=self._get_date_column(self.fit_search_keys),
            date_format=self.date_format,
            random_state=self.random_state,
@@ -3332,11 +3433,18 @@ if response.status_code == 200:
        dataset.columns_renaming = self.fit_columns_renaming

        self.passed_features = [
-            column
+            column
+            for column, meaning_type in meaning_types.items()
+            if meaning_type
+            in [
+                FileColumnMeaningType.FEATURE,
+                FileColumnMeaningType.DATE_FEATURE,
+                FileColumnMeaningType.GENERATED_FEATURE,
+            ]
        ]

        self._search_task = dataset.search(
-            trace_id=
+            trace_id=self._get_trace_id(),
            progress_bar=progress_bar,
            start_time=start_time,
            progress_callback=progress_callback,
@@ -3356,7 +3464,7 @@ if response.status_code == 200:
                if not self.__is_registered:
                    print(self.bundle.get("polling_unregister_information"))

-                progress = self.get_progress(
+                progress = self.get_progress()
                prev_progress = None
                progress.recalculate_eta(time.time() - start_time)
                if progress_bar is not None:
@@ -3382,16 +3490,16 @@ if response.status_code == 200:
                        )
                        raise RuntimeError(self.bundle.get("search_task_failed_status"))
                    time.sleep(poll_period_seconds)
-                    progress = self.get_progress(
+                    progress = self.get_progress()
            except KeyboardInterrupt as e:
                print(self.bundle.get("search_stopping"))
-                self.rest_client.stop_search_task_v2(
+                self.rest_client.stop_search_task_v2(self._get_trace_id(), self._search_task.search_task_id)
                self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
                self._search_task = None
                print(self.bundle.get("search_stopped"))
                raise e

-        self._search_task.poll_result(
+        self._search_task.poll_result(self._get_trace_id(), quiet=True)

        seconds_left = time.time() - start_time
        progress = SearchProgress(97.0, ProgressStage.GENERATING_REPORT, seconds_left)
@@ -3420,10 +3528,9 @@ if response.status_code == 200:
            msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
            self.__log_warning(msg)

-        self.__prepare_feature_importances(
+        self.__prepare_feature_importances(df)

        self._select_features_by_psi(
-            trace_id=trace_id,
            X=X,
            y=y,
            eval_set=eval_set,
@@ -3436,7 +3543,7 @@ if response.status_code == 200:
            progress_callback=progress_callback,
        )

-        self.__prepare_feature_importances(
+        self.__prepare_feature_importances(df)

        self.__show_selected_features()

@@ -3471,7 +3578,6 @@ if response.status_code == 200:
                scoring,
                estimator,
                remove_outliers_calc_metrics,
-                trace_id,
                progress_bar,
                progress_callback,
            )
@@ -3557,7 +3663,8 @@ if response.status_code == 200:
            keys.append("EMAIL")
        if "DATE" in keys:
            keys.append("DATETIME")
-
+        autodetected_search_keys = self.autodetected_search_keys or {}
+        search_keys_with_autodetection = {**self.search_keys, **autodetected_search_keys}
        return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]

    def _validate_train_eval(
@@ -3566,11 +3673,10 @@ if response.status_code == 200:
        y: pd.Series | None = None,
        eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
        is_transform: bool = False,
-        silent: bool = False,
    ) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
        validated_X = self._validate_X(X, is_transform)
        validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
        return validated_X, validated_y, validated_eval_set

    def _encode_id_columns(
@@ -3696,30 +3802,41 @@ if response.status_code == 200:
        return validated_y

    def _validate_eval_set(
-        self,
-
+        self,
+        X: pd.DataFrame,
+        eval_set: list[tuple[pd.DataFrame, pd.Series]] | None,
+    ) -> list[tuple[pd.DataFrame, pd.Series]] | None:
        if eval_set is None:
            return None
        validated_eval_set = []
-
-        for idx, eval_pair in enumerate(eval_set):
+        for _, eval_pair in enumerate(eval_set):
            validated_pair = self._validate_eval_set_pair(X, eval_pair)
-            if validated_pair[1].isna().all():
-                if not has_date:
-                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
-                elif self.columns_for_online_api:
-                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
-                else:
-                    msg = None
-                if msg:
-                    if not silent:
-                        print(msg)
-                    self.logger.warning(msg)
-                    continue
            validated_eval_set.append(validated_pair)

        return validated_eval_set

+    def _validate_OOT(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
+        if EVAL_SET_INDEX not in df.columns:
+            return df
+
+        for eval_set_index in df[EVAL_SET_INDEX].unique():
+            if eval_set_index == 0:
+                continue
+            eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+            date_col = self._get_date_column(search_keys)
+            has_date = date_col is not None and date_col in eval_df.columns
+            if eval_df[TARGET].isna().all():
+                msg = None
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(eval_set_index)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(eval_set_index)
+                if msg:
+                    print(msg)
+                    self.logger.warning(msg)
+                    df = df[df[EVAL_SET_INDEX] != eval_set_index]
+        return df
+
    def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: tuple) -> tuple[pd.DataFrame, pd.Series]:
        if len(eval_pair) != 2:
            raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
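
Note (illustrative, not part of the package diff): the new `_validate_OOT` drops out-of-time eval folds whose target is entirely empty when no date key is usable. A self-contained sketch of the same filtering idea; column names are made up:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "eval_set_index": [0, 0, 1, 1, 2, 2],
        "target": [1.0, 0.0, np.nan, np.nan, 1.0, 0.0],
    }
)
has_date = False  # pretend no DATE/DATETIME search key was provided

for idx in df["eval_set_index"].unique():
    if idx == 0:
        continue  # 0 marks the train part, never dropped
    fold = df[df["eval_set_index"] == idx]
    if fold["target"].isna().all() and not has_date:
        df = df[df["eval_set_index"] != idx]  # OOT fold without a date key is unsupported

print(df["eval_set_index"].unique())  # [0 2]
```
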
@@ -3860,8 +3977,8 @@ if response.status_code == 200:
        X = Xy.drop(columns=TARGET)
        y = Xy[TARGET].copy()

-        if
-            X.drop(columns=
+        if DateTimeConverter.DATETIME_COL in X.columns:
+            X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

        return X, y

@@ -3871,8 +3988,8 @@ if response.status_code == 200:
        X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
    ) -> tuple[pd.DataFrame, pd.Series]:
        if cv not in [CVType.time_series, CVType.blocked_time_series]:
-            if
-                date_column =
+            if DateTimeConverter.DATETIME_COL in X.columns:
+                date_column = DateTimeConverter.DATETIME_COL
            else:
                date_column = FeaturesEnricher._get_date_column(search_keys)
            sort_columns = [date_column] if date_column is not None else []
@@ -3900,8 +4017,8 @@ if response.status_code == 200:

        y = Xy[TARGET].copy()

-        if
-            X.drop(columns=
+        if DateTimeConverter.DATETIME_COL in X.columns:
+            X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

        return X, y

@@ -3980,12 +4097,10 @@ if response.status_code == 200:
            maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
            if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
                # TODO cast date column to single dtype
-                date_converter =
-
-                )
-
-                min_date = converted_X[maybe_date_col].min()
-                max_date = converted_X[maybe_date_col].max()
+                date_converter = DateTimeConverter(maybe_date_col, self.date_format, generate_cyclical_features=False)
+                date_col_values = date_converter.to_date_ms(X)
+                min_date = date_col_values.min()
+                max_date = date_col_values.max()
                self.logger.info(f"Dates interval is ({min_date}, {max_date})")

        except Exception:
@@ -4017,12 +4132,14 @@ if response.status_code == 200:
            or set(search_keys.values()) == {SearchKey.EMAIL}
            or set(search_keys.values()) == {SearchKey.HEM}
            or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
+            or len(search_keys) == 0
+            or set(search_keys.values()) == {SearchKey.CUSTOM_KEY}
        ):
            if not silent:
                self.__log_warning(bundle.get("current_date_added"))
-            df[
-            search_keys[
-            converter =
+            df[CURRENT_DATE_COL] = datetime.date.today()
+            search_keys[CURRENT_DATE_COL] = SearchKey.DATE
+            converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
            df = converter.convert(df)
        return df

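
Note (illustrative, not part of the package diff): when no usable date key is present, a constant current-date column is appended and registered as a DATE search key. A tiny sketch of that fallback; the column name is an assumption standing in for the `CURRENT_DATE_COL` constant imported at the top of the file:

```python
import datetime

import pandas as pd

CURRENT_DATE_COL = "current_date"  # assumed value of the metadata constant

df = pd.DataFrame({"feature": [1, 2, 3]})
df[CURRENT_DATE_COL] = datetime.date.today()  # same date for every row
search_keys = {CURRENT_DATE_COL: "DATE"}      # registered as a DATE search key
print(df)
```
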
@@ -4036,7 +4153,7 @@ if response.status_code == 200:
        return [
            col
            for col, t in search_keys.items()
-            if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
+            if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
        ]

    @staticmethod
@@ -4153,8 +4270,8 @@ if response.status_code == 200:
            "__target",
            ENTITY_SYSTEM_RECORD_ID,
        ]
-        if
-            date_column =
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            date_column = DateTimeConverter.DATETIME_COL
            sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
        else:
            date_column = FeaturesEnricher._get_date_column(search_keys)
@@ -4335,47 +4452,6 @@ if response.status_code == 200:

        return result_features

-    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
-        if self._search_task is None:
-            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
-        features_meta = self._search_task.get_all_features_metadata_v2()
-        if features_meta is None:
-            raise Exception(self.bundle.get("missing_features_meta"))
-        features_meta = deepcopy(features_meta)
-
-        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
-        df = df.rename(columns=original_names_dict)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
-
-        importances = {}
-
-        for feature_meta in features_meta:
-            if feature_meta.name in original_names_dict.keys():
-                feature_meta.name = original_names_dict[feature_meta.name]
-
-            is_client_feature = feature_meta.name in df.columns
-
-            if feature_meta.shap_value == 0.0:
-                continue
-
-            # Use only important features
-            if (
-                feature_meta.name == COUNTRY
-                # In select_features mode we select also from etalon features and need to show them
-                or (not self.fit_select_features and is_client_feature)
-            ):
-                continue
-
-            # Temporary workaround for duplicate features metadata
-            if feature_meta.name in importances:
-                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
-                continue
-
-            importances[feature_meta.name] = feature_meta.shap_value
-
-        return importances
-
    def __get_categorical_features(self) -> list[str]:
        features_meta = self._search_task.get_all_features_metadata_v2()
        if features_meta is None:
@@ -4385,7 +4461,6 @@ if response.status_code == 200:

    def __prepare_feature_importances(
        self,
-        trace_id: str,
        clients_features_df: pd.DataFrame,
        updated_shaps: dict[str, float] | None = None,
        update_selected_features: bool = True,
@@ -4393,14 +4468,16 @@ if response.status_code == 200:
    ):
        if self._search_task is None:
            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
-        selected_features = self._search_task.get_selected_features(
+        selected_features = self._search_task.get_selected_features(self._get_trace_id())
        features_meta = self._search_task.get_all_features_metadata_v2()
        if features_meta is None:
            raise Exception(self.bundle.get("missing_features_meta"))
        features_meta = deepcopy(features_meta)

-
-
+        file_metadata_columns = self._search_task.get_file_metadata(self._get_trace_id()).columns
+        file_meta_by_orig_name = {c.originalName: c for c in file_metadata_columns}
+        original_names_dict = {c.name: c.originalName for c in file_metadata_columns}
+        features_df = self._search_task.get_all_initial_raw_features(self._get_trace_id(), metrics_calculation=True)

        # To be sure that names with hash suffixes
        clients_features_df = clients_features_df.rename(columns=original_names_dict)
@@ -4419,10 +4496,13 @@ if response.status_code == 200:
            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
            feature_meta.name = original_name

-
+            file_meta = file_meta_by_orig_name.get(original_name)
+            is_generated_feature = (
+                file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
+            )
+            is_client_feature = original_name in clients_features_df.columns and not is_generated_feature

            if selected_features is not None and feature_meta.name not in selected_features:
-                self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
                continue

            selected_features_meta.append(feature_meta)
@@ -4442,9 +4522,13 @@ if response.status_code == 200:

        for feature_meta in selected_features_meta:
            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-
+            file_meta = file_meta_by_orig_name.get(original_name)
+            is_generated_feature = (
+                file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
+            )
+            is_client_feature = original_name in clients_features_df.columns and not is_generated_feature

-            if not is_client_feature:
+            if not is_client_feature and not is_generated_feature:
                self.external_source_feature_names.append(original_name)

            if self.psi_values is not None:
@@ -4475,20 +4559,21 @@ if response.status_code == 200:

            self.feature_names_.append(feature_meta.name)
            self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
-
            df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
-            feature_info = FeatureInfo.from_metadata(
+            feature_info = FeatureInfo.from_metadata(
+                feature_meta, df_for_sample, is_client_feature, is_generated_feature
+            )
            features_info.append(feature_info.to_row(self.bundle))
            features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
            internal_features_info.append(feature_info.to_internal_row(self.bundle))

        if update_selected_features:
-            self._search_task.update_selected_features(
+            self._search_task.update_selected_features(self._get_trace_id(), self.feature_names_)

        if len(features_info) > 0:
            self.features_info = pd.DataFrame(features_info)
            # If all psi values are 0 or null, drop psi column
-            if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
+            if self.features_info[self.bundle.get("features_info_psi")].astype(np.float64).fillna(0.0).eq(0.0).all():
                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
            self._features_info_without_links = pd.DataFrame(features_info_without_links)
            self._internal_features_info = pd.DataFrame(internal_features_info)
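
Note (illustrative, not part of the package diff): the PSI column is now cast to float before the all-zero check, so object or string values no longer break `eq(0.0)`. A minimal standalone sketch of that check; the column name is made up:

```python
import numpy as np
import pandas as pd

features_info = pd.DataFrame({"feature": ["f1", "f2"], "PSI": [None, "0.0"]})
psi = features_info["PSI"].astype(np.float64).fillna(0.0)  # cast first, then compare
if psi.eq(0.0).all():
    features_info = features_info.drop(columns=["PSI"])
print(features_info.columns.tolist())  # ['feature']
```
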
@@ -4681,12 +4766,17 @@ if response.status_code == 200:
        ):
            raise ValidationError(self.bundle.get("empty_search_key").format(column_name))

-        if
-
-
-
-
-
+        if is_transform:
+            fit_autodetected_search_keys = self._get_autodetected_search_keys()
+            if fit_autodetected_search_keys is not None:
+                for key in fit_autodetected_search_keys.keys():
+                    if key not in x.columns:
+                        raise ValidationError(
+                            self.bundle.get("autodetected_search_key_not_found").format(key, x.columns)
+                        )
+                valid_search_keys.update(fit_autodetected_search_keys)
+        elif self.autodetect_search_keys:
+            valid_search_keys = self.__detect_missing_search_keys(x, valid_search_keys, is_demo_dataset)

        if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
            if self.__is_registered:
@@ -4694,7 +4784,8 @@ if response.status_code == 200:
            else:
                msg = self.bundle.get("unregistered_only_personal_keys")
                self.logger.warning(msg + f" Provided search keys: {search_keys}")
-
+                # Current date will be added later
+                # raise ValidationError(msg)

        if (
            len(valid_search_keys.values()) == 1
@@ -4708,7 +4799,7 @@ if response.status_code == 200:
        maybe_date = [k for k, v in valid_search_keys.items() if v in [SearchKey.DATE, SearchKey.DATETIME]]
        if (self.cv is None or self.cv == CVType.k_fold) and len(maybe_date) > 0 and not silent_mode:
            date_column = next(iter(maybe_date))
-            if x[date_column].nunique() > 0.9 * _num_samples(x):
+            if x[date_column].nunique() > 0.9 * _num_samples(x) and not is_transform:
                msg = self.bundle.get("date_search_without_time_series")
                self.__log_warning(msg)

@@ -4723,6 +4814,8 @@ if response.status_code == 200:

        self.logger.info(f"Prepared search keys: {valid_search_keys}")

+        # x = self._validate_empty_search_keys(x, valid_search_keys, is_transform=is_transform)
+
        return valid_search_keys

    def __show_metrics(
@@ -4730,7 +4823,6 @@ if response.status_code == 200:
        scoring: Callable | str | None,
        estimator: Any | None,
        remove_outliers_calc_metrics: bool | None,
-        trace_id: str,
        progress_bar: ProgressBar | None = None,
        progress_callback: Callable[[SearchProgress], Any] | None = None,
    ):
@@ -4738,7 +4830,6 @@ if response.status_code == 200:
            scoring=scoring,
            estimator=estimator,
            remove_outliers_calc_metrics=remove_outliers_calc_metrics,
-            trace_id=trace_id,
            internal_call=True,
            progress_bar=progress_bar,
            progress_callback=progress_callback,
@@ -4803,80 +4894,67 @@ if response.status_code == 200:
        df: pd.DataFrame,
        search_keys: dict[str, SearchKey],
        is_demo_dataset: bool,
-        silent_mode=False,
-        is_transform=False,
    ) -> dict[str, SearchKey]:
        sample = df.head(100)

-
-
-
-
-
-        # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
-        if check_need_detect(SearchKey.POSTAL_CODE):
-            maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
-            if maybe_keys:
-                new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
+        if SearchKey.DATE not in search_keys.values() and SearchKey.DATETIME not in search_keys.values():
+            maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
+            if len(maybe_keys) > 0:
+                datetime_key = maybe_keys[0]
+                new_keys = {datetime_key: SearchKey.DATETIME}
                search_keys.update(new_keys)
-                self.
-                self.logger.info(f"Autodetected search key
-
-                print(self.bundle.get("postal_code_detected").format(maybe_keys))
+                self._add_autodetected_search_keys(new_keys)
+                self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
+                print(self.bundle.get("datetime_detected").format(datetime_key))

-        if (
-
-
-
-
+        # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
+        maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
+        if maybe_keys:
+            new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
+            search_keys.update(new_keys)
+            self._add_autodetected_search_keys(new_keys)
+            self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
+            print(self.bundle.get("postal_code_detected").format(maybe_keys))
+
+        if SearchKey.COUNTRY not in search_keys.values() and self.country_code is None:
            maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
            if maybe_key:
-
-
+                new_keys = {maybe_key[0]: SearchKey.COUNTRY}
+                search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
                self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
-
-                print(self.bundle.get("country_detected").format(maybe_key))
+                print(self.bundle.get("country_detected").format(maybe_key))

-        if (
-            # SearchKey.EMAIL not in search_keys.values()
-            SearchKey.HEM not in search_keys.values()
-            and check_need_detect(SearchKey.HEM)
-        ):
+        if SearchKey.EMAIL not in search_keys.values() and SearchKey.HEM not in search_keys.values():
            maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
            if maybe_keys:
                if self.__is_registered or is_demo_dataset:
                    new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
                    search_keys.update(new_keys)
-                    self.
+                    self._add_autodetected_search_keys(new_keys)
                    self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
-
-                    print(self.bundle.get("email_detected").format(maybe_keys))
+                    print(self.bundle.get("email_detected").format(maybe_keys))
                else:
                    self.logger.warning(
                        f"Autodetected search key EMAIL in column {maybe_keys}."
                        " But not used because not registered user"
                    )
-
-                    self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))
+                    self.__log_warning(self.bundle.get("email_detected_not_registered").format(maybe_keys))

        # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-                    "But not used because not registered user"
-                )
-                if not silent_mode:
-                    self.__log_warning(self.bundle.get("phone_detected_not_registered"))
+        maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
+        if maybe_keys:
+            if self.__is_registered or is_demo_dataset:
+                new_keys = {key: SearchKey.PHONE for key in maybe_keys}
+                search_keys.update(new_keys)
+                self._add_autodetected_search_keys(new_keys)
+                self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
+                print(self.bundle.get("phone_detected").format(maybe_keys))
+            else:
+                self.logger.warning(
+                    f"Autodetected search key PHONE in column {maybe_keys}. " "But not used because not registered user"
+                )
+                self.__log_warning(self.bundle.get("phone_detected_not_registered"))

        return search_keys

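
Note (illustrative, not part of the package diff): the fit path now auto-detects a DATETIME search key before the other detectors run. A rough sketch of how a column can be probed for datetime-like content; the helper name and threshold are assumptions, not the library's actual `DateSearchKeyDetector` logic:

```python
import pandas as pd


def looks_like_datetime(series: pd.Series, threshold: float = 0.9) -> bool:
    # Parse a small sample and accept the column if most values convert cleanly
    sample = series.dropna().astype(str).head(100)
    if sample.empty:
        return False
    parsed = pd.to_datetime(sample, errors="coerce")
    return parsed.notna().mean() >= threshold


df = pd.DataFrame({"order_date": ["2024-01-01", "2024-02-15", "2024-03-20"]})
print(looks_like_datetime(df["order_date"]))  # True: every value parses
```
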
@@ -4948,13 +5026,12 @@ if response.status_code == 200:

    def dump_input(
        self,
-        trace_id: str,
        X: pd.DataFrame | pd.Series,
        y: pd.DataFrame | pd.Series | None = None,
        eval_set: tuple | None = None,
    ):
-        def dump_task(X_, y_, eval_set_):
-            with MDC(
+        def dump_task(X_, y_, eval_set_, trace_id_):
+            with MDC(correlation_id=trace_id_):
                try:
                    if isinstance(X_, pd.Series):
                        X_ = X_.to_frame()
@@ -4962,13 +5039,13 @@ if response.status_code == 200:
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
                        x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
-                        if self.rest_client.is_file_uploaded(
+                        if self.rest_client.is_file_uploaded(trace_id_, x_digest_sha256):
                            self.logger.info(
                                f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
                            )
                        else:
                            self.rest_client.dump_input_file(
-
+                                trace_id_, f"{tmp_dir}/x.parquet", "x.parquet", x_digest_sha256
                            )

                        if y_ is not None:
@@ -4976,13 +5053,13 @@ if response.status_code == 200:
                                y_ = y_.to_frame()
                            y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
                            y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
-                            if self.rest_client.is_file_uploaded(
+                            if self.rest_client.is_file_uploaded(trace_id_, y_digest_sha256):
                                self.logger.info(
                                    f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
                                )
                            else:
                                self.rest_client.dump_input_file(
-
+                                    trace_id_, f"{tmp_dir}/y.parquet", "y.parquet", y_digest_sha256
                                )

                        if eval_set_ is not None and len(eval_set_) > 0:
@@ -4991,14 +5068,14 @@ if response.status_code == 200:
                                    eval_x_ = eval_x_.to_frame()
                                eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
                                eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
-                                if self.rest_client.is_file_uploaded(
+                                if self.rest_client.is_file_uploaded(trace_id_, eval_x_digest_sha256):
                                    self.logger.info(
                                        f"File eval_x_{idx}.parquet was already uploaded with"
                                        f" digest {eval_x_digest_sha256}, skipping"
                                    )
                                else:
                                    self.rest_client.dump_input_file(
-
+                                        trace_id_,
                                        f"{tmp_dir}/eval_x_{idx}.parquet",
                                        f"eval_x_{idx}.parquet",
                                        eval_x_digest_sha256,
@@ -5008,14 +5085,14 @@ if response.status_code == 200:
                                    eval_y_ = eval_y_.to_frame()
                                eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
                                eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
-                                if self.rest_client.is_file_uploaded(
+                                if self.rest_client.is_file_uploaded(trace_id_, eval_y_digest_sha256):
                                    self.logger.info(
                                        f"File eval_y_{idx}.parquet was already uploaded"
                                        f" with digest {eval_y_digest_sha256}, skipping"
                                    )
                                else:
                                    self.rest_client.dump_input_file(
-
+                                        trace_id_,
                                        f"{tmp_dir}/eval_y_{idx}.parquet",
                                        f"eval_y_{idx}.parquet",
                                        eval_y_digest_sha256,
@@ -5024,7 +5101,8 @@ if response.status_code == 200:
                self.logger.warning("Failed to dump input files", exc_info=True)

        try:
-
+            trace_id = self._get_trace_id()
+            Thread(target=dump_task, args=(X, y, eval_set, trace_id), daemon=True).start()
        except Exception:
            self.logger.warning("Failed to dump input files", exc_info=True)

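
Note (illustrative, not part of the package diff): `dump_input` now resolves the trace id on the calling thread and hands it to the background task explicitly, so the daemon thread logs under the right correlation id. A minimal sketch of that hand-off:

```python
from threading import Thread


def dump_task(rows, trace_id_):
    # Runs off the main thread; trace_id_ was captured by the caller
    print(f"[{trace_id_}] dumping {len(rows)} rows")


trace_id = "abc-123"  # in the library this would come from the trace-id helper
t = Thread(target=dump_task, args=([1, 2, 3], trace_id), daemon=True)
t.start()
t.join()  # only for the demo; the library fires and forgets
```
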