upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- upgini/__about__.py +1 -1
- upgini/dataset.py +48 -78
- upgini/features_enricher.py +726 -516
- upgini/http.py +15 -19
- upgini/metadata.py +1 -10
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +8 -6
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/search_task.py +6 -0
- upgini/utils/config.py +43 -0
- upgini/utils/deduplicate_utils.py +57 -9
- upgini/utils/display_utils.py +1 -1
- upgini/utils/feature_info.py +5 -0
- upgini/utils/hash_utils.py +159 -0
- upgini/utils/psi.py +300 -0
- upgini/utils/sample_utils.py +45 -42
- upgini/utils/target_utils.py +53 -2
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/METADATA +62 -32
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/RECORD +22 -19
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -1,8 +1,6 @@
 import dataclasses
 import datetime
 import gc
-import hashlib
-import itertools
 import json
 import logging
 import os
@@ -14,7 +12,7 @@ from collections import Counter
 from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
-from typing import Any, Callable,
+from typing import Any, Callable, Optional, Set, Union
 
 import numpy as np
 import pandas as pd
@@ -101,6 +99,7 @@ from upgini.utils.email_utils import (
 from upgini.utils.feature_info import FeatureInfo, _round_shap_value
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.format import Format
+from upgini.utils.hash_utils import file_hash, hash_input
 from upgini.utils.ip_utils import IpSearchKeyConverter
 from upgini.utils.phone_utils import PhoneSearchKeyDetector
 from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -112,9 +111,11 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
-from upgini.utils.
+from upgini.utils.config import SampleConfig
+from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
+from upgini.utils.sample_utils import SampleColumns, _num_samples, sample
 from upgini.utils.sort import sort_columns
-from upgini.utils.target_utils import calculate_psi, define_task
+from upgini.utils.target_utils import calculate_psi, define_task, is_imbalanced
 from upgini.utils.warning_counter import WarningCounter
 from upgini.version_validator import validate_version
 
@@ -130,7 +131,7 @@ class FeaturesEnricher(TransformerMixin):
     Parameters
     ----------
     search_keys: dict of str->SearchKey or int->SearchKey
-
+        dictionary with column names or indices mapping to key types.
         Each of this columns will be used as a search key to find features.
 
     country_code: str, optional (default=None)
@@ -162,7 +163,7 @@ class FeaturesEnricher(TransformerMixin):
        Custom loss function to use for feature selection and metrics calculation.
 
    shared_datasets: list of str, optional (default=None)
-
+        list of private shared dataset ids for custom search
    """
 
    TARGET_NAME = "target"
@@ -206,32 +207,32 @@ class FeaturesEnricher(TransformerMixin):
 
    def __init__(
        self,
-        search_keys: Optional[
+        search_keys: Optional[dict[str, SearchKey]] = None,
        country_code: Optional[str] = None,
        model_task_type: Optional[Union[ModelTaskType, str]] = None,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        search_id: Optional[str] = None,
-        shared_datasets: Optional[
+        shared_datasets: Optional[list[str]] = None,
        runtime_parameters: Optional[RuntimeParameters] = None,
        date_format: Optional[str] = None,
        random_state: int = 42,
        cv: Optional[CVType] = None,
        loss: Optional[str] = None,
        autodetect_search_keys: bool = True,
-        generate_features: Optional[
-        columns_for_online_api: Optional[
+        generate_features: Optional[list[str]] = None,
+        columns_for_online_api: Optional[list[str]] = None,
        round_embeddings: Optional[int] = None,
        logs_enabled: bool = True,
        raise_validation_error: bool = True,
-        exclude_columns: Optional[
+        exclude_columns: Optional[list[str]] = None,
        baseline_score_column: Optional[Any] = None,
        client_ip: Optional[str] = None,
        client_visitorid: Optional[str] = None,
        custom_bundle_config: Optional[str] = None,
        add_date_if_missing: bool = True,
        disable_force_downsampling: bool = False,
-        id_columns: Optional[
+        id_columns: Optional[list[str]] = None,
        generate_search_key_features: bool = True,
        sample_config: Optional[SampleConfig] = None,
        print_trace_id: bool = False,
@@ -257,21 +258,21 @@ class FeaturesEnricher(TransformerMixin):
            self.logger.warning(msg)
            print(msg)
 
-        self.passed_features:
+        self.passed_features: list[str] = []
        self.df_with_original_index: Optional[pd.DataFrame] = None
-        self.fit_columns_renaming: Optional[
+        self.fit_columns_renaming: Optional[dict[str, str]] = None
        self.country_added = False
-        self.fit_generated_features:
+        self.fit_generated_features: list[str] = []
        self.fit_dropped_features: Set[str] = set()
        self.fit_search_keys = search_keys
        self.warning_counter = WarningCounter()
        self.X: Optional[pd.DataFrame] = None
        self.y: Optional[pd.Series] = None
-        self.eval_set: Optional[
-        self.autodetected_search_keys:
+        self.eval_set: Optional[list[tuple]] = None
+        self.autodetected_search_keys: dict[str, SearchKey] = {}
        self.imbalanced = False
        self.fit_select_features = True
-        self.__cached_sampled_datasets:
+        self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
            dict()
        )
 
@@ -296,8 +297,8 @@ class FeaturesEnricher(TransformerMixin):
        self.metrics: Optional[pd.DataFrame] = None
        self.feature_names_ = []
        self.external_source_feature_names = []
-        self.zero_shap_client_features = []
        self.feature_importances_ = []
+        self.psi_values: Optional[dict[str, float]] = None
        self.search_id = search_id
        self.disable_force_downsampling = disable_force_downsampling
        self.print_trace_id = print_trace_id
@@ -317,7 +318,8 @@ class FeaturesEnricher(TransformerMixin):
            x_columns = [c.name for c in file_metadata.columns]
            self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
            df = pd.DataFrame(columns=x_columns)
-            self.__prepare_feature_importances(trace_id, df, silent=True)
+            self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
+            self.__show_selected_features()
            # TODO validate search_keys with search_keys from file_metadata
            print(self.bundle.get("search_by_task_id_finish"))
            self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -395,37 +397,54 @@ class FeaturesEnricher(TransformerMixin):
 
    api_key = property(_get_api_key, _set_api_key)
 
-
-    def _check_eval_set(eval_set, X, bundle: ResourceBundle):
+    def _check_eval_set(self, eval_set, X):
        checked_eval_set = []
-        if eval_set is
+        if eval_set is None:
+            return checked_eval_set
+        if isinstance(eval_set, tuple):
            eval_set = [eval_set]
-        if
-            raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
-        for eval_pair in eval_set or []:
+        if not isinstance(eval_set, list):
+            raise ValidationError(self.bundle.get("unsupported_type_eval_set").format(type(eval_set)))
+        for i, eval_pair in enumerate(eval_set or [], 1):
+            # Handle OOT
+            if isinstance(eval_pair, pd.DataFrame):
+                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+                eval_pair = (eval_pair, empty_target)
+            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
            if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
-                raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-            if
+                raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if eval_pair[1] is None:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
+            if not is_frames_equal(X, eval_pair[0], self.bundle):
                checked_eval_set.append(eval_pair)
+            else:
+                msg = f"Eval set {i} is equal to train set and will be ignored"
+                self.logger.warning(msg)
+                print(msg)
        return checked_eval_set
 
    def fit(
        self,
        X: Union[pd.DataFrame, pd.Series, np.ndarray],
-        y: Union[pd.Series, np.ndarray,
-        eval_set: Optional[Union[
+        y: Union[pd.Series, np.ndarray, list],
+        eval_set: Optional[Union[list[tuple], tuple]] = None,
        *args,
-        exclude_features_sources: Optional[
+        exclude_features_sources: Optional[list[str]] = None,
        calculate_metrics: Optional[bool] = None,
        estimator: Optional[Any] = None,
        scoring: Union[Callable, str, None] = None,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
        remove_outliers_calc_metrics: Optional[bool] = None,
        progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
        search_id_callback: Optional[Callable[[str], Any]] = None,
        select_features: bool = True,
        auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.2,
+        stability_agg_func: str = "max",
        **kwargs,
    ):
        """Fit to data.
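The reworked `_check_eval_set` above now accepts out-of-time (OOT) evaluation segments without a target: a bare DataFrame, a 1-tuple, or an `(X, None)` pair are all normalized to a pair with an all-NaN target. A minimal self-contained sketch of that normalization (a simplified restatement of the added code, not the library function itself):

import numpy as np
import pandas as pd

def normalize_eval_pair(eval_pair):
    # Mirrors the OOT handling added in _check_eval_set (simplified sketch)
    if isinstance(eval_pair, pd.DataFrame):  # bare OOT frame, no target given
        eval_pair = (eval_pair,)
    if isinstance(eval_pair, tuple) and (len(eval_pair) == 1 or eval_pair[1] is None):
        x = eval_pair[0]
        return x, pd.Series([np.nan] * len(x), index=x.index)
    return eval_pair

x_oot = pd.DataFrame({"feature": [1, 2, 3]})
for form in (x_oot, (x_oot,), (x_oot, None)):
    x, y = normalize_eval_pair(form)
    assert y.isna().all()  # every OOT form ends up with an all-NaN target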
@@ -440,14 +459,8 @@ class FeaturesEnricher(TransformerMixin):
        y: array-like of shape (n_samples,)
            Target values.
 
-        eval_set:
-
-
-        importance_threshold: float, optional (default=None)
-            Minimum SHAP value to select a feature. Default value is 0.0.
-
-        max_features: int, optional (default=None)
-            Maximum number of most important features to select. If None, the number is unlimited.
+        eval_set: list[tuple], optional (default=None)
+            list of pairs (X, y) for validation.
 
        calculate_metrics: bool, optional (default=None)
            Whether to calculate and show metrics.
@@ -465,6 +478,13 @@ class FeaturesEnricher(TransformerMixin):
        select_features: bool, optional (default=False)
            If True, return only selected features both from input and data sources.
            Otherwise, return all features from input and only selected features from data sources.
+
+        stability_threshold: float, optional (default=0.2)
+            Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
+            then feature will be dropped.
+
+        stability_agg_func: str, optional (default="max")
+            Function to aggregate stability values. Can be "max", "min", "mean".
        """
        trace_id = str(uuid.uuid4())
        if self.print_trace_id:
@@ -500,7 +520,7 @@ class FeaturesEnricher(TransformerMixin):
        try:
            self.X = X
            self.y = y
-            self.eval_set = self._check_eval_set(eval_set, X
+            self.eval_set = self._check_eval_set(eval_set, X)
            self.dump_input(trace_id, X, y, self.eval_set)
            self.__set_select_features(select_features)
            self.__inner_fit(
@@ -514,8 +534,8 @@ class FeaturesEnricher(TransformerMixin):
                calculate_metrics=calculate_metrics,
                estimator=estimator,
                scoring=scoring,
-
-
+                stability_threshold=stability_threshold,
+                stability_agg_func=stability_agg_func,
                remove_outliers_calc_metrics=remove_outliers_calc_metrics,
                auto_fe_parameters=auto_fe_parameters,
                progress_callback=progress_callback,
@@ -560,13 +580,11 @@ class FeaturesEnricher(TransformerMixin):
    def fit_transform(
        self,
        X: Union[pd.DataFrame, pd.Series, np.ndarray],
-        y: Union[pd.DataFrame, pd.Series, np.ndarray,
-        eval_set: Optional[Union[
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
+        eval_set: Optional[Union[list[tuple], tuple]] = None,
        *args,
-        exclude_features_sources: Optional[
+        exclude_features_sources: Optional[list[str]] = None,
        keep_input: bool = True,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
        calculate_metrics: Optional[bool] = None,
        scoring: Union[Callable, str, None] = None,
        estimator: Optional[Any] = None,
@@ -574,6 +592,8 @@ class FeaturesEnricher(TransformerMixin):
        progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
        select_features: bool = True,
        auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.2,
+        stability_agg_func: str = "max",
        **kwargs,
    ) -> pd.DataFrame:
        """Fit to data, then transform it.
@@ -589,21 +609,12 @@ class FeaturesEnricher(TransformerMixin):
        y: array-like of shape (n_samples,)
            Target values.
 
-        eval_set:
-
+        eval_set: list[tuple], optional (default=None)
+            list of pairs (X, y) for validation.
 
        keep_input: bool, optional (default=True)
            If True, copy original input columns to the output dataframe.
 
-        importance_threshold: float, optional (default=None)
-            Minimum SHAP value to select a feature. Default value is 0.0.
-
-        max_features: int, optional (default=None)
-            Maximum number of most important features to select. If None, the number is unlimited.
-
-        calculate_metrics: bool, optional (default=None)
-            Whether to calculate and show metrics.
-
        estimator: sklearn-compatible estimator, optional (default=None)
            Custom estimator for metrics calculation.
 
@@ -618,6 +629,13 @@ class FeaturesEnricher(TransformerMixin):
            If True, return only selected features both from input and data sources.
            Otherwise, return all features from input and only selected features from data sources.
 
+        stability_threshold: float, optional (default=0.2)
+            Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
+            then feature will be dropped.
+
+        stability_agg_func: str, optional (default="max")
+            Function to aggregate stability values. Can be "max", "min", "mean".
+
        Returns
        -------
        X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -655,7 +673,7 @@ class FeaturesEnricher(TransformerMixin):
        try:
            self.X = X
            self.y = y
-            self.eval_set = self._check_eval_set(eval_set, X
+            self.eval_set = self._check_eval_set(eval_set, X)
            self.__set_select_features(select_features)
            self.dump_input(trace_id, X, y, self.eval_set)
 
@@ -673,8 +691,8 @@ class FeaturesEnricher(TransformerMixin):
                calculate_metrics=calculate_metrics,
                scoring=scoring,
                estimator=estimator,
-
-
+                stability_threshold=stability_threshold,
+                stability_agg_func=stability_agg_func,
                remove_outliers_calc_metrics=remove_outliers_calc_metrics,
                auto_fe_parameters=auto_fe_parameters,
                progress_callback=progress_callback,
@@ -717,8 +735,6 @@ class FeaturesEnricher(TransformerMixin):
            X,
            exclude_features_sources=exclude_features_sources,
            keep_input=keep_input,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
            trace_id=trace_id,
            silent_mode=True,
            progress_bar=progress_bar,
@@ -732,10 +748,8 @@ class FeaturesEnricher(TransformerMixin):
        X: pd.DataFrame,
        *args,
        y: Optional[pd.Series] = None,
-        exclude_features_sources: Optional[
+        exclude_features_sources: Optional[list[str]] = None,
        keep_input: bool = True,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
        trace_id: Optional[str] = None,
        metrics_calculation: bool = False,
        silent_mode=False,
@@ -756,12 +770,6 @@ class FeaturesEnricher(TransformerMixin):
        keep_input: bool, optional (default=True)
            If True, copy original input columns to the output dataframe.
 
-        importance_threshold: float, optional (default=None)
-            Minimum SHAP value to select a feature. Default value is 0.0.
-
-        max_features: int, optional (default=None)
-            Maximum number of most important features to select. If None, the number is unlimited.
-
        Returns
        -------
        X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -798,8 +806,6 @@ class FeaturesEnricher(TransformerMixin):
            X,
            y=y,
            exclude_features_sources=exclude_features_sources,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
            metrics_calculation=metrics_calculation,
            silent_mode=silent_mode,
            progress_bar=progress_bar,
@@ -854,15 +860,13 @@ class FeaturesEnricher(TransformerMixin):
    def calculate_metrics(
        self,
        X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
-        y: Union[pd.DataFrame, pd.Series, np.ndarray,
-        eval_set: Optional[Union[
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
+        eval_set: Optional[Union[list[tuple], tuple]] = None,
        *args,
        scoring: Union[Callable, str, None] = None,
        cv: Union[BaseCrossValidator, CVType, None] = None,
        estimator=None,
-        exclude_features_sources: Optional[
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
+        exclude_features_sources: Optional[list[str]] = None,
        remove_outliers_calc_metrics: Optional[bool] = None,
        trace_id: Optional[str] = None,
        internal_call: bool = False,
@@ -880,8 +884,8 @@ class FeaturesEnricher(TransformerMixin):
        y: array-like of shape (n_samples,), optional (default=None)
            Target values. If X not passed then y from fit will be used
 
-        eval_set:
-
+        eval_set: list[tuple], optional (default=None)
+            list of pairs (X, y) for validation. If X not passed then eval_set from fit will be used
 
        scoring: string or callable, optional (default=None)
            A string or a scorer callable object / function with signature scorer(estimator, X, y).
@@ -893,12 +897,6 @@ class FeaturesEnricher(TransformerMixin):
        estimator: sklearn-compatible estimator, optional (default=None)
            Custom estimator for metrics calculation. If not passed then CatBoost will be used.
 
-        importance_threshold: float, optional (default=None)
-            Minimum SHAP value to select a feature. Default value is 0.0.
-
-        max_features: int, optional (default=None)
-            Maximum number of most important features to select. If None, the number is unlimited.
-
        remove_outliers_calc_metrics, optional (default=True)
            If True then rows with target ouliers will be dropped on metrics calculation
 
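Because `importance_threshold` and `max_features` are removed from `calculate_metrics` here (and from `fit`, `fit_transform`, and `transform` above), callers upgrading from 1.2.113 must drop those keyword arguments; feature selection is now driven at fit time via `select_features` and the stability threshold. A before/after sketch:

# upgini 1.2.113 and earlier (these kwargs are no longer accepted in 1.2.114):
# metrics = enricher.calculate_metrics(importance_threshold=0.01, max_features=50)

# upgini 1.2.114:
metrics = enricher.calculate_metrics(remove_outliers_calc_metrics=True)
print(metrics)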
@@ -929,7 +927,7 @@ class FeaturesEnricher(TransformerMixin):
        effective_X = X if X is not None else self.X
        effective_y = y if y is not None else self.y
        effective_eval_set = eval_set if eval_set is not None else self.eval_set
-        effective_eval_set = self._check_eval_set(effective_eval_set, effective_X
+        effective_eval_set = self._check_eval_set(effective_eval_set, effective_X)
 
        if (
            self._search_task is None
@@ -941,7 +939,7 @@ class FeaturesEnricher(TransformerMixin):
            raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+            effective_X, effective_y, effective_eval_set, silent=internal_call
        )
 
        if self.X is None:
@@ -961,8 +959,6 @@ class FeaturesEnricher(TransformerMixin):
                validated_eval_set,
                exclude_features_sources=exclude_features_sources,
                cv=cv if cv is not None else self.cv,
-                importance_threshold=importance_threshold,
-                max_features=max_features,
                scoring=scoring,
                estimator=estimator,
                remove_outliers_calc_metrics=remove_outliers_calc_metrics,
@@ -979,46 +975,43 @@ class FeaturesEnricher(TransformerMixin):
                return None
 
            cat_features_from_backend = self.__get_categorical_features()
+            # Convert to original names
+            cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
            client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
                estimator, validated_X, self.search_keys
            )
+            # Exclude id columns from cat_features
            if self.id_columns and self.id_columns_encoder is not None:
                if cat_features_from_backend:
                    cat_features_from_backend = [
-                        c
-                        for c in cat_features_from_backend
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                        c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
                    ]
                if client_cat_features:
                    client_cat_features = [
-                        c
-                        for c in client_cat_features
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                        c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
                    ]
            for cat_feature in cat_features_from_backend:
-
-
-
-                        search_keys_for_metrics.append(original_cat_feature)
+                if cat_feature in self.search_keys:
+                    if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                        search_keys_for_metrics.append(cat_feature)
                    else:
-                        self.logger.warning(self.bundle.get("cat_feature_search_key").format(
+                        self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
            search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
            self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
 
-            prepared_data = self.
+            prepared_data = self._get_cached_enriched_data(
                trace_id=trace_id,
                X=X,
                y=y,
                eval_set=eval_set,
                exclude_features_sources=exclude_features_sources,
-                importance_threshold=importance_threshold,
-                max_features=max_features,
                remove_outliers_calc_metrics=remove_outliers_calc_metrics,
                cv_override=cv,
                search_keys_for_metrics=search_keys_for_metrics,
                progress_bar=progress_bar,
                progress_callback=progress_callback,
                client_cat_features=client_cat_features,
+                is_for_metrics=True,
            )
            if prepared_data is None:
                return None
@@ -1033,23 +1026,9 @@ class FeaturesEnricher(TransformerMixin):
                groups,
                _cv,
                columns_renaming,
+                _,
            ) = prepared_data
 
-            # rename cat_features
-            if client_cat_features:
-                for new_c, old_c in columns_renaming.items():
-                    if old_c in client_cat_features:
-                        client_cat_features.remove(old_c)
-                        client_cat_features.append(new_c)
-                for cat_feature in client_cat_features:
-                    if cat_feature not in fitting_X.columns:
-                        self.logger.error(
-                            f"Client cat_feature `{cat_feature}` not found in"
-                            f" x columns: {fitting_X.columns.to_list()}"
-                        )
-            else:
-                client_cat_features = []
-
            # rename baseline_score_column
            reversed_renaming = {v: k for k, v in columns_renaming.items()}
            baseline_score_column = self.baseline_score_column
@@ -1074,9 +1053,9 @@ class FeaturesEnricher(TransformerMixin):
            self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
            has_date = self._get_date_column(search_keys) is not None
-            has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
            model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
            cat_features = list(set(client_cat_features + cat_features_from_backend))
+            has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
            baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
            enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
            if len(enriched_cat_features) < len(cat_features):
@@ -1196,8 +1175,6 @@ class FeaturesEnricher(TransformerMixin):
            # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
            if len(fitting_eval_set_dict) > 0:
                for idx in fitting_eval_set_dict.keys():
-                    # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
                    (
                        eval_X_sorted,
                        eval_y_sorted,
@@ -1205,6 +1182,10 @@ class FeaturesEnricher(TransformerMixin):
                        enriched_eval_y_sorted,
                    ) = fitting_eval_set_dict[idx]
 
+                    if eval_y_sorted.isna().all():
+                        # Skip OOT eval set
+                        continue
+
                    if baseline_estimator is not None:
                        self.logger.info(
                            f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1247,17 +1228,14 @@ class FeaturesEnricher(TransformerMixin):
                            "quality_metrics_eval_segment"
                        ).format(idx + 1),
                        self.bundle.get("quality_metrics_rows_header"): _num_samples(
-                            # effective_eval_set[idx][0]
                            # Use actually used for metrics dataset
                            eval_X_sorted
                        ),
-                        # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                    }
                    if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                        eval_y_sorted
                    ):
                        eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                            # np.mean(validated_eval_set[idx][1]), 4
                            # Use actually used for metrics dataset
                            np.mean(eval_y_sorted),
                            4,
@@ -1279,7 +1257,7 @@ class FeaturesEnricher(TransformerMixin):
                metrics.append(eval_metrics)
 
            if updating_shaps is not None:
-                decoded_X = self._decode_id_columns(fitting_X
+                decoded_X = self._decode_id_columns(fitting_X)
                self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
            metrics_df = pd.DataFrame(metrics)
@@ -1330,7 +1308,188 @@ class FeaturesEnricher(TransformerMixin):
        finally:
            self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
-    def
+    def _select_features_by_psi(
+        self,
+        trace_id: str,
+        X: Union[pd.DataFrame, pd.Series, np.ndarray],
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
+        eval_set: Optional[Union[list[tuple], tuple]],
+        stability_threshold: float,
+        stability_agg_func: Callable,
+        cv: Union[BaseCrossValidator, CVType, str, None] = None,
+        estimator=None,
+        exclude_features_sources: Optional[list[str]] = None,
+        progress_bar: bool = True,
+        progress_callback: Optional[Callable] = None,
+    ):
+        search_keys = self.search_keys.copy()
+        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+        if isinstance(X, np.ndarray):
+            search_keys = {str(k): v for k, v in search_keys.items()}
+
+        date_column = self._get_date_column(search_keys)
+        has_date = date_column is not None
+        if not has_date:
+            self.logger.info("No date column for OOT PSI calculation")
+            return
+        if not validated_eval_set:
+            self.logger.info("No eval set for OOT PSI calculation")
+            return
+        if validated_X[date_column].nunique() <= 1:
+            self.logger.warning("Constant date for OOT PSI calculation")
+            return
+        if self.cv is not None and self.cv.is_time_series():
+            self.logger.warning("Time series CV is not supported for OOT PSI calculation")
+            return
+
+        cat_features_from_backend = self.__get_categorical_features()
+        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
+        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+            estimator, validated_X, search_keys
+        )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
+                ]
+
+        prepared_data = self._get_cached_enriched_data(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            exclude_features_sources=exclude_features_sources,
+            remove_outliers_calc_metrics=False,
+            cv_override=cv,
+            search_keys_for_metrics=search_keys_for_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            client_cat_features=client_cat_features,
+        )
+        if prepared_data is None:
+            return None
+
+        (
+            validated_X,
+            _,
+            y_sorted,
+            _,
+            _,
+            fitting_eval_set_dict,
+            _,
+            _,
+            _,
+            _,
+            eval_set_dates,
+        ) = prepared_data
+
+        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+        # Drop unstable features
+        unstable_features = self._check_stability(
+            validated_X,
+            validated_eval_set,
+            fitting_eval_set_dict,
+            eval_set_dates,
+            search_keys,
+            stability_threshold,
+            stability_agg_func,
+            cat_features,
+            model_task_type,
+        )
+
+        if unstable_features:
+            msg = f"{len(unstable_features)} feature(s) are unstable: {unstable_features} and will be dropped"
+            self.logger.warning(msg)
+            print(msg)
+
+    def _check_stability(
+        self,
+        X: pd.DataFrame,
+        eval_set: list[tuple[pd.DataFrame, pd.Series]],
+        enriched_eval_set: dict,
+        eval_set_dates: dict[int, pd.Series],
+        search_keys: dict[str, SearchKey],
+        stability_threshold: float,
+        stability_agg_func: str | None,
+        cat_features: list[str],
+        model_task_type: ModelTaskType,
+    ) -> list[str]:
+        # Find latest eval set or earliest if all eval sets are before train set
+        date_column = self._get_date_column(search_keys)
+
+        # Get minimum date from main dataset X
+        main_min_date = X[date_column].dropna().min()
+
+        # Find minimum date for each eval_set and compare with main dataset
+        eval_dates = []
+        for i, (eval_x, _) in enumerate(eval_set):
+            if date_column in eval_x.columns:
+                if len(eval_x) < 1000:
+                    self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
+                    continue
+                eval_min_date = eval_x[date_column].dropna().min()
+                eval_max_date = eval_x[date_column].dropna().max()
+                eval_dates.append((i, eval_min_date, eval_max_date))
+
+        if not eval_dates:
+            return []
+
+        # Check if any eval_set has minimum date >= main dataset minimum date
+        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+        if later_eval_sets:
+            # If there are eval_sets with date >= main date, choose the one with highest maximum date
+            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+        else:
+            # If all eval_sets have dates < main date, choose the one with lowest minimux date
+            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+        checking_eval_set_df = (
+            checking_eval_set[2]
+            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+        )
+        checking_eval_set_df = checking_eval_set_df.copy()
+
+        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+        psi_values_sparse = calculate_sparsity_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")
+
+        unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+        if unstable_by_sparsity:
+            self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+
+        psi_values = calculate_features_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
+        )
+
+        self.logger.info(f"PSI values by value: {psi_values}")
+
+        unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+        if unstable_by_value:
+            self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+
+        self.psi_values = {
+            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+        }
+
+        total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
+
+        return total_unstable_features
+
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
        renaming = self.fit_columns_renaming or {}
        self.logger.info(f"Updating SHAP values: {new_shaps}")
        new_shaps = {
@@ -1347,7 +1506,7 @@ class FeaturesEnricher(TransformerMixin):
            display_html_dataframe(
                self.features_info,
                self._features_info_without_links,
-                self.bundle.get("relevant_features_header"),
+                self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
                display_handle=self.features_info_display_handle,
            )
        except (ImportError, NameError):
@@ -1398,13 +1557,13 @@ class FeaturesEnricher(TransformerMixin):
        self.logger.warning(msg)
 
    def _has_features_with_commercial_schema(
-        self, commercial_schema: str, exclude_features_sources: Optional[
+        self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
    ) -> bool:
        return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
 
    def _get_features_with_commercial_schema(
-        self, commercial_schema: str, exclude_features_sources: Optional[
-    ) ->
+        self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
+    ) -> list[str]:
        if exclude_features_sources:
            filtered_features_info = self._internal_features_info[
                ~self._internal_features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
@@ -1418,19 +1577,19 @@ class FeaturesEnricher(TransformerMixin):
            ].values
        )
 
-    def _has_paid_features(self, exclude_features_sources: Optional[
+    def _has_paid_features(self, exclude_features_sources: Optional[list[str]]) -> bool:
        return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
 
    def _is_input_same_as_fit(
        self,
        X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
-        y: Union[pd.DataFrame, pd.Series, np.ndarray,
-        eval_set: Optional[
-    ) ->
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
+        eval_set: Optional[list[tuple]] = None,
+    ) -> tuple:
        if X is None:
            return True, self.X, self.y, self.eval_set
 
-        checked_eval_set = self._check_eval_set(eval_set, X
+        checked_eval_set = self._check_eval_set(eval_set, X)
 
        if (
            X is self.X
@@ -1457,8 +1616,8 @@ class FeaturesEnricher(TransformerMixin):
        self,
        X: pd.DataFrame,
        cv_override: Union[BaseCrossValidator, CVType, str, None],
-        search_keys:
-    ) ->
+        search_keys: dict[str, SearchKey],
+    ) -> tuple[BaseCrossValidator, Optional[np.ndarray]]:
        _cv = cv_override or self.cv
        group_columns = sorted(self._get_group_columns(X, search_keys))
        groups = None
@@ -1486,9 +1645,9 @@ class FeaturesEnricher(TransformerMixin):
        return _cv, groups
 
    def _get_and_validate_client_cat_features(
-        self, estimator: Optional[Any], X: pd.DataFrame, search_keys:
-    ) ->
-        cat_features =
+        self, estimator: Optional[Any], X: pd.DataFrame, search_keys: dict[str, SearchKey]
+    ) -> tuple[Optional[list[str]], list[str]]:
+        cat_features = []
        search_keys_for_metrics = []
        if (
            estimator is not None
@@ -1516,41 +1675,41 @@ class FeaturesEnricher(TransformerMixin):
                    raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
        return cat_features, search_keys_for_metrics
 
-    def
+    def _get_cached_enriched_data(
        self,
        trace_id: str,
        X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
-        y: Union[pd.DataFrame, pd.Series, np.ndarray,
-        eval_set: Optional[Union[
-        exclude_features_sources: Optional[
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
+        eval_set: Optional[Union[list[tuple], tuple]] = None,
+        exclude_features_sources: Optional[list[str]] = None,
        remove_outliers_calc_metrics: Optional[bool] = None,
        cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
-        search_keys_for_metrics: Optional[
+        search_keys_for_metrics: Optional[list[str]] = None,
        progress_bar: Optional[ProgressBar] = None,
        progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
-        client_cat_features: Optional[
+        client_cat_features: Optional[list[str]] = None,
+        is_for_metrics: bool = False,
    ):
        is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
        is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-        checked_eval_set = self._check_eval_set(eval_set, X
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
+        checked_eval_set = self._check_eval_set(eval_set, X)
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
-        sampled_data = self.
-            trace_id,
-            validated_X,
-            validated_y,
-            validated_eval_set,
-            exclude_features_sources,
-            is_input_same_as_fit,
-            is_demo_dataset,
-            remove_outliers_calc_metrics,
-            progress_bar,
-            progress_callback,
+        sampled_data = self._get_enriched_datasets(
+            trace_id=trace_id,
+            validated_X=validated_X,
+            validated_y=validated_y,
+            eval_set=validated_eval_set,
+            exclude_features_sources=exclude_features_sources,
+            is_input_same_as_fit=is_input_same_as_fit,
+            is_demo_dataset=is_demo_dataset,
+            remove_outliers_calc_metrics=remove_outliers_calc_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            is_for_metrics=is_for_metrics,
        )
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming =
-            sampled_data
+        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features) = (
+            dataclasses.astuple(sampled_data)
        )
 
        excluding_search_keys = list(search_keys.keys())
@@ -1566,14 +1725,9 @@ class FeaturesEnricher(TransformerMixin):
 
        client_features = [
            c
-            for c in
-            if (
-
-                or c in set(self.feature_names_).union(self.id_columns or [])
-                or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
-            )
-            and c
-            not in (
+            for c in (validated_X.columns.to_list() + generated_features)
+            if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
+            and c not in (
                excluding_search_keys
                + list(self.fit_dropped_features)
                + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1581,20 +1735,17 @@ class FeaturesEnricher(TransformerMixin):
        ]
        self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
 
-
-            importance_threshold, max_features, trace_id, validated_X
-        )
-        filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
+        selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
 
        X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
        enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
 
        cv, groups = self._get_cv_and_groups(enriched_X_sorted, cv_override, search_keys)
 
-
+        existing_selected_enriched_features = [c for c in selected_enriched_features if c in enriched_X_sorted.columns]
 
        fitting_X = X_sorted[client_features].copy()
-        fitting_enriched_X = enriched_X_sorted[client_features +
+        fitting_enriched_X = enriched_X_sorted[client_features + existing_selected_enriched_features].copy()
 
        renamed_generate_features = [columns_renaming.get(c, c) for c in (self.generate_features or [])]
        renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]
@@ -1658,7 +1809,7 @@ class FeaturesEnricher(TransformerMixin):
            fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
        )
        fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X
+        fitting_X, _ = self._encode_id_columns(fitting_X)
        self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
        fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
        fitting_enriched_x_columns = sort_columns(
@@ -1670,14 +1821,18 @@ class FeaturesEnricher(TransformerMixin):
            logger=self.logger,
        )
        fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
        self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        date_column = self._get_date_column(search_keys)
+        eval_set_dates = {}
        for idx, eval_tuple in eval_set_sampled_dict.items():
            eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
            eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
            enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                enriched_eval_X, eval_y_sampled, self.cv
            )
+            if date_column is not None:
+                eval_set_dates[idx] = eval_X_sorted[date_column]
            fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
            fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1698,8 +1853,8 @@ class FeaturesEnricher(TransformerMixin):
                .astype(np.float64)
            )
 
-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X
+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
 
            if len(unknown_dict) > 0:
                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1722,6 +1877,7 @@ class FeaturesEnricher(TransformerMixin):
            groups,
            cv,
            columns_renaming,
+            eval_set_dates,
        )
 
    @dataclass
@@ -1729,29 +1885,31 @@ class FeaturesEnricher(TransformerMixin):
        X_sampled: pd.DataFrame
        y_sampled: pd.Series
        enriched_X: pd.DataFrame
-        eval_set_sampled_dict:
-        search_keys:
-        columns_renaming:
+        eval_set_sampled_dict: dict[int, tuple[pd.DataFrame, pd.Series]]
+        search_keys: dict[str, SearchKey]
+        columns_renaming: dict[str, str]
+        generated_features: list[str]
 
-    def
+    def _get_enriched_datasets(
        self,
        trace_id: str,
        validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
-        validated_y: Union[pd.DataFrame, pd.Series, np.ndarray,
-        eval_set: Optional[
-        exclude_features_sources: Optional[
+        validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
+        eval_set: Optional[list[tuple]],
+        exclude_features_sources: Optional[list[str]],
        is_input_same_as_fit: bool,
        is_demo_dataset: bool,
        remove_outliers_calc_metrics: Optional[bool],
        progress_bar: Optional[ProgressBar],
        progress_callback: Optional[Callable[[SearchProgress], Any]],
+        is_for_metrics: bool = False,
    ) -> _EnrichedDataForMetrics:
        datasets_hash = hash_input(validated_X, validated_y, eval_set)
        cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
        if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
            self.logger.info("Cached enriched dataset found - use it")
            return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
-        elif len(self.
+        elif len(self.feature_names_) == 0 or all([f in validated_X.columns for f in self.feature_names_]):
            self.logger.info("No external features selected. So use only input datasets for metrics calculation")
            return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
        # TODO save and check if dataset was deduplicated - use imbalance branch for such case
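`_EnrichedDataForMetrics` above gains a `generated_features: list[str]` field, and `_get_cached_enriched_data` earlier in the diff unpacks the dataclass positionally with `dataclasses.astuple`, which makes field order load-bearing. A self-contained illustration of that pattern (stand-in field types and names, not the real dataclass):

import dataclasses

@dataclasses.dataclass
class Enriched:  # stand-in for _EnrichedDataForMetrics
    x_sampled: str
    y_sampled: str
    generated_features: list[str]

data = Enriched("X", "y", ["f_autofe_date_diff"])  # hypothetical feature name
# astuple flattens the fields in declaration order, so appending a new field
# keeps positional unpacking valid as long as every caller adds one slot:
x, y, generated = dataclasses.astuple(data)
assert generated == ["f_autofe_date_diff"]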
@@ -1777,12 +1935,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
1777
1935
|
trace_id,
|
1778
1936
|
progress_bar,
|
1779
1937
|
progress_callback,
|
1938
|
+
is_for_metrics=is_for_metrics,
|
1780
1939
|
)
|
1781
1940
|
|
1782
1941
|
def __get_sampled_cached_enriched(
|
1783
|
-
self, datasets_hash: str, exclude_features_sources: Optional[
|
1942
|
+
self, datasets_hash: str, exclude_features_sources: Optional[list[str]]
|
1784
1943
|
) -> _EnrichedDataForMetrics:
|
1785
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
1944
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
|
1786
1945
|
            self.__cached_sampled_datasets[datasets_hash]
        )

        if exclude_features_sources:
@@ -1796,10 +1955,11 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict,
             columns_renaming,
             search_keys,
+            generated_features,
         )
 
     def __get_enriched_as_input(
-        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[list[tuple]], is_demo_dataset: bool
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
 
@@ -1844,7 +2004,18 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming = normalizer.columns_renaming
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.
+        df = self._add_fit_system_record_id(
+            df,
+            search_keys,
+            SYSTEM_RECORD_ID,
+            TARGET,
+            columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
+        )
 
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
@@ -1853,6 +2024,10 @@ class FeaturesEnricher(TransformerMixin):
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
 
+        df = df.rename(columns=columns_renaming)
+        generated_features = [columns_renaming.get(c, c) for c in generated_features]
+        search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
+
         train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
         X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
         y_sampled = train_df[TARGET].copy()
@@ -1875,23 +2050,26 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict,
             columns_renaming,
             search_keys,
+            generated_features,
         )
 
     def __get_enriched_from_fit(
         self,
-        eval_set: Optional[
+        eval_set: Optional[list[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys.copy()
 
         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if
+        if remove_outliers_calc_metrics is None:
+            remove_outliers_calc_metrics = True
+        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -1901,11 +2079,8 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-
-
-                    not_msg = ""
-                else:
-                    not_msg = "not "
+                rows_to_drop = outliers
+                not_msg = ""
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
@@ -1963,12 +2138,14 @@ class FeaturesEnricher(TransformerMixin):
                 enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
                 eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
-        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
-        X_sampled.rename(columns=
-        enriched_X.rename(columns=
+        # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+        enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=
-            enriched_eval_X.rename(columns=
+            eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+            enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
+        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+        generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -1979,17 +2156,19 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict,
             self.fit_columns_renaming,
             search_keys,
+            generated_features,
         )
 
     def __get_enriched_from_transform(
         self,
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
-        eval_set: Optional[
-        exclude_features_sources: Optional[
+        eval_set: Optional[list[tuple]],
+        exclude_features_sources: Optional[list[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
+        is_for_metrics: bool = False,
     ) -> _EnrichedDataForMetrics:
         has_eval_set = eval_set is not None
 
@@ -1997,6 +2176,16 @@ class FeaturesEnricher(TransformerMixin):
 
         # Prepare
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+
+        # Exclude OOT eval sets from transform because they are not used for metrics calculation
+        if not is_for_metrics and EVAL_SET_INDEX in df.columns:
+            for eval_index in df[EVAL_SET_INDEX].unique():
+                if eval_index == 0:
+                    continue
+                eval_df = df.query(f"{EVAL_SET_INDEX} == {eval_index}")
+                if eval_df[TARGET].isna().all():
+                    df = df.query(f"{EVAL_SET_INDEX} != {eval_index}")
+
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
         df = self.__downsample_for_metrics(df)
 
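Note on the hunk above: an eval fold whose target is entirely NaN is treated as an out-of-time (OOT) holdout and is skipped when preparing data for metrics. A minimal sketch of the same check, assuming hypothetical `eval_set_index`/`target` column names in place of upgini's internal `EVAL_SET_INDEX` and `TARGET` constants:

```python
import numpy as np
import pandas as pd

# Hypothetical column names for illustration only.
EVAL_SET_INDEX, TARGET = "eval_set_index", "target"

def drop_oot_folds(df: pd.DataFrame) -> pd.DataFrame:
    """Drop eval folds whose target is entirely NaN (OOT folds)."""
    if EVAL_SET_INDEX not in df.columns:
        return df
    for idx in df[EVAL_SET_INDEX].unique():
        if idx == 0:  # fold 0 is the train part
            continue
        fold = df[df[EVAL_SET_INDEX] == idx]
        if fold[TARGET].isna().all():
            df = df[df[EVAL_SET_INDEX] != idx]
    return df

df = pd.DataFrame({
    "eval_set_index": [0, 0, 1, 1],
    "target": [1.0, 0.0, np.nan, np.nan],  # fold 1 is OOT
})
print(drop_oot_folds(df))  # only fold 0 remains
```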
@@ -2026,13 +2215,7 @@ class FeaturesEnricher(TransformerMixin):
             enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
         )
 
-
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
-        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+        search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         # Cache and return results
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -2044,10 +2227,11 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict,
             columns_renaming,
             search_keys,
+            generated_features,
         )
 
     def __combine_train_and_eval_sets(
-        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[
+        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[list[tuple]] = None
     ) -> pd.DataFrame:
         df = X.copy()
         if y is not None:
@@ -2099,8 +2283,8 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     def __extract_train_data(
-        self, enriched_df: pd.DataFrame, x_columns:
-    ) ->
+        self, enriched_df: pd.DataFrame, x_columns: list[str]
+    ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
         if EVAL_SET_INDEX in enriched_df.columns:
             enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
         else:
@@ -2111,8 +2295,8 @@ class FeaturesEnricher(TransformerMixin):
         return X_sampled, y_sampled, enriched_X
 
     def __extract_eval_data(
-        self, enriched_df: pd.DataFrame, x_columns:
-    ) ->
+        self, enriched_df: pd.DataFrame, x_columns: list[str], enriched_X_columns: list[str], eval_set_len: int
+    ) -> tuple[dict[int, tuple], dict[int, pd.Series]]:
         eval_set_sampled_dict = {}
 
         for idx in range(eval_set_len):
@@ -2130,9 +2314,10 @@ class FeaturesEnricher(TransformerMixin):
         X_sampled: pd.DataFrame,
         y_sampled: pd.Series,
         enriched_X: pd.DataFrame,
-        eval_set_sampled_dict:
-        columns_renaming:
-        search_keys:
+        eval_set_sampled_dict: dict[int, tuple],
+        columns_renaming: dict[str, str],
+        search_keys: dict[str, SearchKey],
+        generated_features: list[str],
     ) -> _EnrichedDataForMetrics:
 
         self.__cached_sampled_datasets[datasets_hash] = (
@@ -2142,10 +2327,11 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict,
             search_keys,
             columns_renaming,
+            generated_features,
         )
 
         return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
+            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features
         )
 
     def __mk_sampled_data_tuple(
@@ -2153,17 +2339,11 @@ class FeaturesEnricher(TransformerMixin):
         X_sampled: pd.DataFrame,
         y_sampled: pd.Series,
         enriched_X: pd.DataFrame,
-        eval_set_sampled_dict:
-        search_keys:
-        columns_renaming:
+        eval_set_sampled_dict: dict,
+        search_keys: dict,
+        columns_renaming: dict[str, str],
+        generated_features: list[str],
     ):
-        # X_sampled - with hash-suffixes
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        search_keys = {
-            reversed_renaming.get(k, k): v
-            for k, v in search_keys.items()
-            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
-        }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2171,6 +2351,7 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict=eval_set_sampled_dict,
             search_keys=search_keys,
             columns_renaming=columns_renaming,
+            generated_features=generated_features,
         )
 
     def get_search_id(self) -> Optional[str]:
@@ -2295,15 +2476,13 @@ if response.status_code == 200:
         X: pd.DataFrame,
         *,
         y: Optional[pd.Series] = None,
-        exclude_features_sources: Optional[
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
+        exclude_features_sources: Optional[list[str]] = None,
         metrics_calculation: bool = False,
         silent_mode: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         add_fit_system_record_id: bool = False,
-    ) ->
+    ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
 
@@ -2313,7 +2492,7 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
+            X, y, eval_set=None, is_transform=True, silent=True
         )
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
@@ -2321,11 +2500,8 @@ if response.status_code == 200:
 
         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
-        filtered_columns = self.__filtered_enriched_features(
-            importance_threshold, max_features, trace_id, validated_X
-        )
         # If there are no important features, return original dataframe
-        if
+        if len(self.feature_names_) == 0:
             msg = self.bundle.get("no_important_features_for_transform")
             self.__log_warning(msg, show_support_link=True)
             return X, {c: c for c in X.columns}, [], dict()
@@ -2415,7 +2591,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, search_keys, self.
+                df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2432,9 +2608,9 @@ if response.status_code == 200:
         if not external_features:
             self.logger.warning(
                 "No external features found, returning original dataframe"
-                f" with generated important features: {
+                f" with generated important features: {self.feature_names_}"
             )
-            filtered_columns = [c for c in
+            filtered_columns = [c for c in self.feature_names_ if c in df.columns]
             self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
             return df[filtered_columns], columns_renaming, generated_features, search_keys
 
@@ -2462,13 +2638,17 @@ if response.status_code == 200:
 
         features_not_to_pass = []
         if add_fit_system_record_id:
-            df = self.
+            df = self._add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
                 TARGET,
                 columns_renaming,
-
+                self.id_columns,
+                self.cv,
+                self.model_task_type,
+                self.logger,
+                self.bundle,
             )
             df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
             features_not_to_pass.append(SORT_ID)
@@ -2568,16 +2748,6 @@ if response.status_code == 200:
         )
         dataset.columns_renaming = columns_renaming
 
-        if max_features is not None or importance_threshold is not None:
-            exclude_features_sources = list(
-                set(
-                    (exclude_features_sources or [])
-                    + self._get_excluded_features(max_features, importance_threshold)
-                )
-            )
-            if len(exclude_features_sources) == 0:
-                exclude_features_sources = None
-
         validation_task = self._search_task.validation(
             trace_id,
             dataset,
@@ -2642,6 +2812,8 @@ if response.status_code == 200:
         print(self.bundle.get("transform_start"))
 
         # Prepare input DataFrame for __enrich by concatenating generated ids and client features
+        df_before_explode = df_before_explode.rename(columns=columns_renaming)
+        generated_features = [columns_renaming.get(c, c) for c in generated_features]
         combined_df = pd.concat(
             [
                 validated_Xy.reset_index(drop=True),
@@ -2659,14 +2831,21 @@ if response.status_code == 200:
         )
 
         selected_generated_features = [
-            c for c in generated_features if not self.fit_select_features or c in
+            c for c in generated_features if not self.fit_select_features or c in self.feature_names_
         ]
-
+        selected_input_columns = [
             c
-            for c in
-            if
+            for c in validated_Xy.columns
+            if not self.fit_select_features
+            or c in self.feature_names_
+            or c in self.search_keys
+            or c in (self.id_columns or [])
+            or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
         ]
-        selecting_columns
+        selecting_columns = selected_input_columns + selected_generated_features
+        selecting_columns.extend(
+            c for c in result.columns if c in self.feature_names_ and c not in selecting_columns
+        )
         if add_fit_system_record_id:
             selecting_columns.append(SORT_ID)
 
@@ -2692,29 +2871,7 @@ if response.status_code == 200:
 
         return result, columns_renaming, generated_features, search_keys
 
-    def
-        features_info = self._internal_features_info
-        comm_schema_header = self.bundle.get("features_info_commercial_schema")
-        shap_value_header = self.bundle.get("features_info_shap")
-        feature_name_header = self.bundle.get("features_info_name")
-        external_features = features_info[features_info[comm_schema_header].str.len() > 0]
-        filtered_features = external_features
-        if importance_threshold is not None:
-            filtered_features = filtered_features[filtered_features[shap_value_header] >= importance_threshold]
-        if max_features is not None and len(filtered_features) > max_features:
-            filtered_features = filtered_features.iloc[:max_features, :]
-        if len(filtered_features) == len(external_features):
-            return []
-        else:
-            if len(filtered_features[filtered_features[comm_schema_header].isin([CommercialSchema.PAID.value])]):
-                return []
-        excluded_features = external_features[~external_features.index.isin(filtered_features.index)].copy()
-        excluded_features = excluded_features[
-            excluded_features[comm_schema_header].isin([CommercialSchema.PAID.value])
-        ]
-        return excluded_features[feature_name_header].values.tolist()
-
-    def __validate_search_keys(self, search_keys: Dict[str, SearchKey], search_id: Optional[str] = None):
+    def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: Optional[str] = None):
         if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
             if search_id:
                 self.logger.debug(f"search_id {search_id} provided without search_keys")
@@ -2788,17 +2945,17 @@ if response.status_code == 200:
         self,
         trace_id: str,
         X: Union[pd.DataFrame, pd.Series, np.ndarray],
-        y: Union[pd.DataFrame, pd.Series, np.ndarray,
-        eval_set: Optional[
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
+        eval_set: Optional[list[tuple]],
         progress_bar: Optional[ProgressBar],
         start_time: int,
         *,
-        exclude_features_sources: Optional[
+        exclude_features_sources: Optional[list[str]] = None,
         calculate_metrics: Optional[bool],
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
-
-
+        stability_threshold: float,
+        stability_agg_func: str,
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
@@ -2812,6 +2969,7 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
+        self.psi_values = None
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
@@ -2862,7 +3020,6 @@ if response.status_code == 200:
         )
 
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
-        self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
 
         self.fit_search_keys = self.search_keys.copy()
         df = self.__handle_index_search_keys(df, self.fit_search_keys)
@@ -2870,8 +3027,22 @@ if response.status_code == 200:
 
         maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         has_date = maybe_date_column is not None
+
         self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
 
+        if EVAL_SET_INDEX in df.columns:
+            only_train_df = df.query(f"{EVAL_SET_INDEX} == 0")
+            only_train_df = only_train_df.drop(columns=[EVAL_SET_INDEX])
+        else:
+            only_train_df = df
+
+        self.imbalanced = is_imbalanced(only_train_df, self.model_task_type, self.sample_config, self.bundle)
+        if self.imbalanced:
+            # Exclude eval sets from fit because they will be transformed before metrics calculation
+            df = only_train_df
+
+        self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
+
         self._validate_binary_observations(validated_y, self.model_task_type)
 
         self.runtime_parameters = get_runtime_params_custom_loss(
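The fit path above now decides up front whether the training target is imbalanced (via `is_imbalanced` from `upgini.utils.target_utils`) and, if so, fits on the train fold only. The sketch below illustrates the general idea with an assumed fixed threshold; upgini's actual rule is driven by its sampling config, not this cut-off:

```python
import pandas as pd

def looks_imbalanced(target: pd.Series, min_class_share: float = 0.25) -> bool:
    """Illustrative imbalance check: flag a classification target when the
    rarest class falls below a share threshold (threshold is an assumption)."""
    shares = target.value_counts(normalize=True)
    return len(shares) > 1 and shares.min() < min_class_share

y = pd.Series([0] * 95 + [1] * 5)
print(looks_imbalanced(y))  # True: positive class is only 5%
```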
@@ -2908,7 +3079,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
             # TODO remove when this logic will be implemented on the back
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, self.fit_search_keys, self.
+                df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2923,10 +3094,13 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")
 
+        self.__adjust_cv(df)
+
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
+            and (self.cv is None or not self.cv.is_time_series())
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -2958,7 +3132,15 @@ if response.status_code == 200:
 
         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
-
+        # Group columns should have normalized names
+        if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
+            original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
+            self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
+                [
+                    original_to_hash.get(c, c)
+                    for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
+                ]
+            )
 
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
@@ -2984,9 +3166,27 @@ if response.status_code == 200:
             else:
                 self.__log_warning(full_duplicates_warning)
 
+        # Check if OOT eval set still more than 1000 rows
+        if EVAL_SET_INDEX in df.columns:
+            for eval_set_index in df[EVAL_SET_INDEX].unique():
+                if eval_set_index == 0:
+                    continue
+                eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+                if np.all(pd.isna(eval_set_df[TARGET])) and len(eval_set_df) < 1000:
+                    self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))
+
         # Explode multiple search keys
-        df = self.
-            df,
+        df = self._add_fit_system_record_id(
+            df,
+            self.fit_search_keys,
+            ENTITY_SYSTEM_RECORD_ID,
+            TARGET,
+            self.fit_columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
         )
 
         # TODO check that this is correct for enrichment
@@ -3020,8 +3220,17 @@ if response.status_code == 200:
         if eval_set is not None and len(eval_set) > 0:
             meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
 
-        df = self.
-            df,
+        df = self._add_fit_system_record_id(
+            df,
+            self.fit_search_keys,
+            SYSTEM_RECORD_ID,
+            TARGET,
+            self.fit_columns_renaming,
+            self.id_columns,
+            self.cv,
+            self.model_task_type,
+            self.logger,
+            self.bundle,
         )
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -3049,6 +3258,7 @@ if response.status_code == 200:
             model_task_type=self.model_task_type,
             cv_type=self.cv,
             id_columns=self.__get_renamed_id_columns(),
+            is_imbalanced=self.imbalanced,
             date_column=self._get_date_column(self.fit_search_keys),
             date_format=self.date_format,
             random_state=self.random_state,
@@ -3128,8 +3338,6 @@ if response.status_code == 200:
             if progress_callback is not None:
                 progress_callback(progress)
 
-        self.imbalanced = dataset.imbalanced
-
         zero_hit_search_keys = self._search_task.get_zero_hit_rate_search_keys()
         if zero_hit_search_keys:
             self.logger.warning(
@@ -3152,7 +3360,23 @@ if response.status_code == 200:
 
         self.__prepare_feature_importances(trace_id, df)
 
-        self.
+        self._select_features_by_psi(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            stability_threshold=stability_threshold,
+            stability_agg_func=stability_agg_func,
+            cv=self.cv,
+            estimator=estimator,
+            exclude_features_sources=exclude_features_sources,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+        )
+
+        self.__prepare_feature_importances(trace_id, df)
+
+        self.__show_selected_features()
 
         autofe_description = self.get_autofe_features_description()
         if autofe_description is not None and len(autofe_description) > 0:
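The new `_select_features_by_psi` step drops features whose distribution drifts between time-ordered slices, controlled by the new `stability_threshold` and `stability_agg_func` parameters. The standard Population Stability Index it is named after is PSI = Σᵢ (aᵢ − eᵢ)·ln(aᵢ/eᵢ), where eᵢ and aᵢ are the bin shares of a baseline ("expected") and a newer ("actual") sample. A minimal sketch, assuming equal-width binning and a small epsilon; upgini's `calculate_features_psi` has its own binning and aggregation:

```python
import numpy as np

def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10, eps: float = 1e-6) -> float:
    """PSI between a baseline sample and a new sample.
    Rule of thumb: < 0.1 stable, 0.1-0.25 moderate shift, > 0.25 strong shift."""
    edges = np.histogram_bin_edges(expected, bins=bins)
    e = np.histogram(expected, bins=edges)[0] / max(len(expected), 1) + eps
    a = np.histogram(actual, bins=edges)[0] / max(len(actual), 1) + eps
    return float(np.sum((a - e) * np.log(a / e)))

rng = np.random.default_rng(0)
baseline = rng.normal(0, 1, 10_000)
shifted = rng.normal(0.5, 1, 10_000)
print(round(psi(baseline, baseline[:5000]), 4))  # ~0: stable
print(round(psi(baseline, shifted), 4))          # large: drifted
```

A feature whose aggregated PSI exceeds the configured threshold would be excluded, which is presumably why `__prepare_feature_importances` is re-run after the selection step above.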
@@ -3194,8 +3418,6 @@ if response.status_code == 200:
                 self.__show_metrics(
                     scoring,
                     estimator,
-                    importance_threshold,
-                    max_features,
                     remove_outliers_calc_metrics,
                     trace_id,
                     progress_bar,
@@ -3212,7 +3434,7 @@ if response.status_code == 200:
         if not self.warning_counter.has_warnings():
             self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
-    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys:
+    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: dict[str, str]):
         email_column = self._get_email_column(self.fit_search_keys)
         hem_column = self._get_hem_column(self.fit_search_keys)
         if email_column:
@@ -3244,26 +3466,29 @@ if response.status_code == 200:
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
-    def __get_renamed_id_columns(self, renaming: Optional[
+    def __get_renamed_id_columns(self, renaming: Optional[dict[str, str]] = None):
         renaming = renaming or self.fit_columns_renaming
         reverse_renaming = {v: k for k, v in renaming.items()}
         return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
 
     def __adjust_cv(self, df: pd.DataFrame):
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.cv is None:
+            date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+            # Check Multivariate time series
+            if (
+                date_column
+                and self.model_task_type == ModelTaskType.REGRESSION
+                and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
+                == 0
+                and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
+            ):
+                msg = self.bundle.get("multivariate_timeseries_detected")
+                self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
+            elif self.model_task_type != ModelTaskType.REGRESSION:
+                msg = self.bundle.get("group_k_fold_in_classification")
+                self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+
+        if self.cv == CVType.group_k_fold:
             group_columns = self._get_group_columns(df, self.fit_search_keys)
             self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
             self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
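`__adjust_cv` now picks a CV scheme when none was given: blocked time-series CV for multivariate time-series regression, and group k-fold for classification, grouped by the highest-priority personal key. A minimal sketch of the group k-fold idea using scikit-learn, with a hypothetical `phone` key as the grouping column:

```python
import pandas as pd
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({
    "phone": ["a", "a", "b", "b", "c", "c"],  # hypothetical search-key column
    "feature": [1, 2, 3, 4, 5, 6],
    "target": [0, 1, 0, 1, 0, 1],
})

# Rows sharing one key never end up in both train and validation folds,
# which prevents leakage through repeated entities.
gkf = GroupKFold(n_splits=3)
for train_idx, val_idx in gkf.split(df[["feature"]], df["target"], groups=df["phone"]):
    assert not set(df["phone"].iloc[train_idx]) & set(df["phone"].iloc[val_idx])
```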
@@ -3275,7 +3500,7 @@ if response.status_code == 200:
         self.cv = cv
         self.runtime_parameters.properties["cv_type"] = self.cv.name
 
-    def get_columns_by_search_keys(self, keys:
+    def get_columns_by_search_keys(self, keys: list[str]):
         if "HEM" in keys:
             keys.append("EMAIL")
         if "DATE" in keys:
@@ -3287,50 +3512,44 @@ if response.status_code == 200:
         self,
         X: pd.DataFrame,
         y: Optional[pd.Series] = None,
-        eval_set: Optional[
+        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
         is_transform: bool = False,
-
+        silent: bool = False,
+    ) -> tuple[pd.DataFrame, pd.Series, Optional[list[tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
         return validated_X, validated_y, validated_eval_set
 
     def _encode_id_columns(
         self,
         X: pd.DataFrame,
-
-    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
-        columns_renaming = columns_renaming or {}
+    ) -> tuple[pd.DataFrame, dict[str, list[Any]]]:
        unknown_dict = {}
 
         if self.id_columns and self.id_columns_encoder is not None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if len(unknown_dict) > 0:
-                self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+            encoding_id_columns = [c for c in self.id_columns if c in X.columns]
+            if len(encoding_id_columns) > 0:
+                self.logger.info(f"Convert id columns to int: {encoding_id_columns}")
+                encoded = self.id_columns_encoder.transform(X[encoding_id_columns])
+                for i, c in enumerate(encoding_id_columns):
+                    unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                    if len(unknown_values) > 0:
+                        unknown_dict[c] = unknown_values
+                X[encoding_id_columns] = encoded
+                X = X.loc[(X[encoding_id_columns] != -1).all(axis=1)]
+
+            if len(unknown_dict) > 0:
+                self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
 
         return X, unknown_dict
 
-    def _decode_id_columns(self, X: pd.DataFrame
-        columns_renaming = columns_renaming or {}
+    def _decode_id_columns(self, X: pd.DataFrame):
         if self.id_columns and self.id_columns_encoder is not None:
-
-
-
-
-            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            X[renamed_id_columns] = decoded
+            decoding_id_columns = [c for c in self.id_columns if c in X.columns]
+            if len(decoding_id_columns) > 0:
+                decoded = self.id_columns_encoder.inverse_transform(X[self.id_columns])
+                X[self.id_columns] = decoded
 
         return X
 
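`_encode_id_columns` above maps id columns to integers and treats `-1` as "unknown", dropping such rows. With scikit-learn's `OrdinalEncoder` that behavior comes from explicit configuration; whether upgini's encoder is set up exactly this way is an assumption here (the fit call in this diff shows a bare `OrdinalEncoder()`), so treat this as a sketch of the mechanism:

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train_ids = pd.DataFrame({"store_id": ["s1", "s2", "s3"]})
new_ids = pd.DataFrame({"store_id": ["s2", "s9"]})  # "s9" unseen at fit time

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(train_ids)

encoded = encoder.transform(new_ids)
print(encoded.ravel())               # [1. -1.]
print(new_ids[encoded[:, 0] == -1])  # rows with unknown ids, dropped above
```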
@@ -3424,12 +3643,32 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(
+    def _validate_eval_set(
+        self, X: pd.DataFrame, eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+    ):
         if eval_set is None:
             return None
-
+        validated_eval_set = []
+        has_date = self._get_date_column(self.search_keys) is not None
+        for idx, eval_pair in enumerate(eval_set):
+            validated_pair = self._validate_eval_set_pair(X, eval_pair)
+            if validated_pair[1].isna().all():
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
+                else:
+                    msg = None
+                if msg:
+                    if not silent:
+                        print(msg)
+                    self.logger.warning(msg)
+                    continue
+            validated_eval_set.append(validated_pair)
 
-
+        return validated_eval_set
+
+    def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: tuple) -> tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
         eval_X, eval_y = eval_pair
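With this validation in place, an `eval_set` entry whose target is entirely NaN is accepted as an out-of-time holdout (given a date key) instead of being rejected as a constant target. A hedged usage sketch, with illustrative data and the enricher calls commented out since the exact fit signature is outside this diff:

```python
import numpy as np
import pandas as pd
# from upgini import FeaturesEnricher, SearchKey  # assumed import path

train = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=100, freq="D")})
y_train = pd.Series(np.random.default_rng(0).integers(0, 2, size=100))

oot = pd.DataFrame({"date": pd.date_range("2024-04-10", periods=30, freq="D")})
y_oot = pd.Series([np.nan] * len(oot))  # all-NaN target marks the fold as OOT

# enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
# enricher.fit(train, y_train, eval_set=[(oot, y_oot)])
```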
@@ -3502,20 +3741,22 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
         eval_y_nunique = validated_eval_y.nunique()
-
+        is_oot = validated_eval_y.isna().all()
+        if not is_oot and eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
-        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+        if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
-
-
-
-
+        if not is_oot:
+            # Check for duplicates between train and eval sets by comparing all values
+            train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+            if len(train_eval_intersection) > 0:
+                raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
         return validated_eval_X, validated_eval_y
 
-    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[
+    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[list[tuple]]):
         if self.baseline_score_column is not None:
             if self.baseline_score_column not in X.columns:
                 raise ValidationError(
@@ -3527,13 +3768,15 @@ if response.status_code == 200:
                 if isinstance(eval_set, tuple):
                     eval_set = [eval_set]
                 for eval in eval_set:
-
-
-
-
+                    is_oot = np.all(pd.isna(eval[1]))
+                    if not is_oot:
+                        if self.baseline_score_column not in eval[0].columns:
+                            raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+                        if eval[0][self.baseline_score_column].isna().any():
+                            raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
-    def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) ->
+    def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
         Xy = pd.concat([X, y], axis=1)
         Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
         return Xy[X.columns].copy(), Xy[TARGET].copy()
@@ -3541,7 +3784,7 @@ if response.status_code == 200:
     @staticmethod
     def _sort_by_system_record_id(
         X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
-    ) ->
+    ) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
             record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
             Xy = X.copy()
@@ -3558,8 +3801,8 @@ if response.status_code == 200:
     # Deprecated
     @staticmethod
     def _sort_by_keys(
-        X: pd.DataFrame, y: pd.Series, search_keys:
-    ) ->
+        X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: Optional[CVType]
+    ) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
             if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
                 date_column = DateTimeSearchKeyConverter.DATETIME_COL
@@ -3599,12 +3842,10 @@ if response.status_code == 200:
         self,
         X: pd.DataFrame,
         y: Union[pd.Series, np.ndarray, list, None] = None,
-        eval_set: Optional[
-        exclude_features_sources: Optional[
+        eval_set: Optional[list[tuple]] = None,
+        exclude_features_sources: Optional[list[str]] = None,
         calculate_metrics: Optional[bool] = None,
         cv: Optional[Any] = None,
-        importance_threshold: Optional[Any] = None,
-        max_features: Optional[Any] = None,
         scoring: Optional[Any] = None,
         estimator: Optional[Any] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
@@ -3620,8 +3861,6 @@ if response.status_code == 200:
             f"Runtime parameters: {self.runtime_parameters}\n"
             f"Date format: {self.date_format}\n"
             f"CV: {cv}\n"
-            f"importance_threshold: {importance_threshold}\n"
-            f"max_features: {max_features}\n"
             f"Shared datasets: {self.shared_datasets}\n"
             f"Random state: {self.random_state}\n"
             f"Generate features: {self.generate_features}\n"
@@ -3685,7 +3924,7 @@ if response.status_code == 200:
         except Exception:
             self.logger.warning("Failed to log debug information", exc_info=True)
 
-    def __handle_index_search_keys(self, df: pd.DataFrame, search_keys:
+    def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
         index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
         index_search_keys = set(index_names).intersection(search_keys.keys())
         if len(index_search_keys) > 0:
@@ -3704,7 +3943,7 @@ if response.status_code == 200:
         return df
 
     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys:
+        self, df: pd.DataFrame, search_keys: dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -3712,7 +3951,8 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-
+            if not silent:
+                self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -3720,7 +3960,7 @@ if response.status_code == 200:
         return df
 
     @staticmethod
-    def _get_group_columns(df: pd.DataFrame, search_keys:
+    def _get_group_columns(df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> list[str]:
         search_key_priority = [SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP]
         for key_type in search_key_priority:
             if key_type in search_keys.values():
@@ -3733,7 +3973,7 @@ if response.status_code == 200:
         ]
 
     @staticmethod
-    def _get_email_column(search_keys:
+    def _get_email_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
         if len(cols) > 1:
             raise Exception("More than one email column found after unnest")
@@ -3741,7 +3981,7 @@ if response.status_code == 200:
         return cols[0]
 
     @staticmethod
-    def _get_hem_column(search_keys:
+    def _get_hem_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
         if len(cols) > 1:
             raise Exception("More than one hem column found after unnest")
@@ -3749,7 +3989,7 @@ if response.status_code == 200:
         return cols[0]
 
     @staticmethod
-    def _get_ip_column(search_keys:
+    def _get_ip_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
         if len(cols) > 1:
             raise Exception("More than one ip column found after unnest")
@@ -3757,32 +3997,32 @@ if response.status_code == 200:
         return cols[0]
 
     @staticmethod
-    def _get_phone_column(search_keys:
+    def _get_phone_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.PHONE:
                 return col
 
     @staticmethod
-    def _get_country_column(search_keys:
+    def _get_country_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.COUNTRY:
                 return col
 
     @staticmethod
-    def _get_postal_column(search_keys:
+    def _get_postal_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
             if t == SearchKey.POSTAL_CODE:
                 return col
 
     @staticmethod
-    def _get_date_column(search_keys:
+    def _get_date_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
         return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
 
     def _explode_multiple_search_keys(
-        self, df: pd.DataFrame, search_keys:
-    ) ->
+        self, df: pd.DataFrame, search_keys: dict[str, SearchKey], columns_renaming: dict[str, str]
+    ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
         # find groups of multiple search keys
-        search_key_names_by_type:
+        search_key_names_by_type: dict[SearchKey, list[str]] = {}
         for key_name, key_type in search_keys.items():
             search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
         search_key_names_by_type = {
@@ -3815,14 +4055,18 @@ if response.status_code == 200:
         self.logger.info(f"Finished explosion. Size after: {len(df)}")
         return df, unnest_search_keys
 
-
-
+    @staticmethod
+    def _add_fit_system_record_id(
         df: pd.DataFrame,
-        search_keys:
+        search_keys: dict[str, SearchKey],
         id_name: str,
         target_name: str,
-        columns_renaming:
-
+        columns_renaming: dict[str, str],
+        id_columns: Optional[list[str]],
+        cv: Optional[CVType],
+        model_task_type: ModelTaskType,
+        logger: Optional[logging.Logger] = None,
+        bundle: ResourceBundle = bundle,
     ) -> pd.DataFrame:
         original_index_name = df.index.name
         index_name = df.index.name or DEFAULT_INDEX
@@ -3851,32 +4095,33 @@ if response.status_code == 200:
         columns_to_sort = [date_column] if date_column is not None else []
 
         do_sorting = True
-        if
+        if id_columns and cv is not None and cv.is_time_series():
             # Check duplicates by date and id_columns
             reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in
+            renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in id_columns]
             duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
             if date_column is not None:
                 duplicate_check_columns.append(date_column)
 
             duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
             if duplicates.any():
-                raise ValueError(
+                raise ValueError(bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
             else:
                 columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
                 columns_to_hash = sort_columns(
                     df[columns_to_hash],
                     target_name,
                     search_keys,
-
+                    model_task_type,
                     sort_exclude_columns,
-                    logger=
+                    logger=logger,
                 )
         else:
             columns_to_hash = sort_columns(
-                df, target_name, search_keys,
+                df, target_name, search_keys, model_task_type, sort_exclude_columns, logger=logger
             )
-
+
+        def sort_df(df: pd.DataFrame) -> pd.DataFrame:
             search_keys_hash = "search_keys_hash"
             if len(columns_to_hash) > 0:
                 factorized_df = df.copy()
@@ -3890,6 +4135,24 @@ if response.status_code == 200:
 
             if search_keys_hash in df.columns:
                 df.drop(columns=search_keys_hash, inplace=True)
+            return df
+
+        if do_sorting:
+            sorted_dfs = []
+            if EVAL_SET_INDEX in df.columns:
+                # Sort train and eval sets separately
+                train = df[df[EVAL_SET_INDEX] == 0].copy()
+                sorted_dfs.append(sort_df(train))
+
+                for eval_set_index in df[EVAL_SET_INDEX].unique():
+                    if eval_set_index == 0:
+                        continue
+                    eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index].copy()
+                    sorted_dfs.append(sort_df(eval_set_df))
+
+                df = pd.concat(sorted_dfs)
+            else:
+                df = sort_df(df)
 
         df = df.reset_index(drop=True).reset_index()
         # system_record_id saves correct order for fit
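The system-record-id assignment above orders rows by a hash of their key columns so that repeated fits over the same data sample identically, now applied to the train and eval folds separately. A minimal sketch of the underlying idea, assuming a simple stable row hash; upgini's actual ordering comes from `sort_columns` plus the factorize/hash step shown in the diff:

```python
import hashlib
import pandas as pd

def stable_sort_by_row_hash(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Order rows by a deterministic hash of selected columns, so the same
    input always yields the same order regardless of its incoming order."""
    row_hash = df[columns].astype(str).agg("|".join, axis=1).map(
        lambda s: hashlib.sha256(s.encode()).hexdigest()
    )
    return df.assign(_h=row_hash).sort_values("_h").drop(columns="_h")

df = pd.DataFrame({"phone": ["111", "333", "222"], "target": [1, 0, 1]})
shuffled = df.sample(frac=1, random_state=42)
ordered_a = stable_sort_by_row_hash(df, ["phone", "target"]).reset_index(drop=True)
ordered_b = stable_sort_by_row_hash(shuffled, ["phone", "target"]).reset_index(drop=True)
assert ordered_a.equals(ordered_b)  # idempotent ordering
```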
@@ -3900,11 +4163,6 @@ if response.status_code == 200:
         df.index.name = original_index_name
         df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
 
-        # meaning_types[id_name] = (
-        #     FileColumnMeaningType.SYSTEM_RECORD_ID
-        #     if id_name == SYSTEM_RECORD_ID
-        #     else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
-        # )
         return df
 
     def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3925,7 +4183,7 @@ if response.status_code == 200:
 
         return df
 
-    def __add_country_code(self, df: pd.DataFrame, search_keys:
+    def __add_country_code(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
         self.country_added = False
 
         if self.country_code is not None and SearchKey.COUNTRY not in search_keys.values():
@@ -3951,6 +4209,7 @@ if response.status_code == 200:
             self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
             raise RuntimeError(self.bundle.get("features_wasnt_returned"))
 
+        result_features = result_features.copy()
         if EVAL_SET_INDEX in result_features.columns:
             result_features = result_features.drop(columns=EVAL_SET_INDEX)
 
@@ -3978,6 +4237,17 @@ if response.status_code == 200:
 
         # TODO drop system_record_id before merge
         # Merge with result features
+        # Align dtypes for join key to avoid int/float merge warnings
+        if ENTITY_SYSTEM_RECORD_ID in input_df.columns and ENTITY_SYSTEM_RECORD_ID in result_features.columns:
+            input_is_float = pd.api.types.is_float_dtype(input_df[ENTITY_SYSTEM_RECORD_ID])
+            result_is_float = pd.api.types.is_float_dtype(result_features[ENTITY_SYSTEM_RECORD_ID])
+            if input_is_float or result_is_float:
+                input_df[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
+                    input_df[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
+                ).astype("float64")
+                result_features[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
+                    result_features[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
+                ).astype("float64")
         result_features = pd.merge(
             input_df,
             result_features,
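Background on the dtype-alignment block above: merging on a key that is integer on one side and float on the other can make pandas warn (and, when float keys are not exact integer values, silently fail to match rows), so both sides are coerced to float64 first. A small demonstration with illustrative column names:

```python
import pandas as pd

left = pd.DataFrame({"record_id": pd.Series([1, 2, 3], dtype="int64")})
right = pd.DataFrame({"record_id": pd.Series([1.0, 2.0], dtype="float64"),
                      "feature": ["a", "b"]})

# Aligning both join keys to one dtype keeps the merge exact and warning-free.
left["record_id"] = left["record_id"].astype("float64")
merged = pd.merge(left, right, on="record_id", how="left")
print(merged)  # ids 1 and 2 get features, id 3 gets NaN
```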
@@ -4039,7 +4309,7 @@ if response.status_code == 200:
|
|
4039
4309
|
|
4040
4310
|
return importances
|
4041
4311
|
|
4042
|
-
def __get_categorical_features(self) ->
|
4312
|
+
def __get_categorical_features(self) -> list[str]:
|
4043
4313
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
4044
4314
|
if features_meta is None:
|
4045
4315
|
raise Exception(self.bundle.get("missing_features_meta"))
|
@@ -4047,10 +4317,16 @@ if response.status_code == 200:
|
|
4047
4317
|
return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
|
4048
4318
|
|
4049
4319
|
def __prepare_feature_importances(
|
4050
|
-
self,
|
4320
|
+
self,
|
4321
|
+
trace_id: str,
|
4322
|
+
clients_features_df: pd.DataFrame,
|
4323
|
+
updated_shaps: Optional[dict[str, float]] = None,
|
4324
|
+
update_selected_features: bool = True,
|
4325
|
+
silent=False,
|
4051
4326
|
):
|
4052
4327
|
if self._search_task is None:
|
4053
4328
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
4329
|
+
selected_features = self._search_task.get_selected_features(trace_id)
|
4054
4330
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
4055
4331
|
if features_meta is None:
|
4056
4332
|
raise Exception(self.bundle.get("missing_features_meta"))
|
@@ -4060,11 +4336,10 @@ if response.status_code == 200:
|
|
4060
4336
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
4061
4337
|
|
4062
4338
|
# To be sure that names with hash suffixes
|
4063
|
-
|
4339
|
+
clients_features_df = clients_features_df.rename(columns=original_names_dict)
|
4064
4340
|
|
4065
4341
|
self.feature_names_ = []
|
4066
4342
|
self.external_source_feature_names = []
|
4067
|
-
self.zero_shap_client_features = []
|
4068
4343
|
self.feature_importances_ = []
|
4069
4344
|
features_info = []
|
4070
4345
|
features_info_without_links = []
|
@@ -4072,11 +4347,18 @@ if response.status_code == 200:
|
|
4072
4347
|
|
4073
4348
|
original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
|
4074
4349
|
|
4350
|
+
selected_features_meta = []
|
4075
4351
|
for feature_meta in features_meta:
|
4076
|
-
|
4077
|
-
|
4352
|
+
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4353
|
+
feature_meta.name = original_name
|
4354
|
+
|
4355
|
+
is_client_feature = original_name in clients_features_df.columns
|
4356
|
+
|
4357
|
+
if selected_features is not None and feature_meta.name not in selected_features:
|
4358
|
+
self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
|
4359
|
+
continue
|
4078
4360
|
|
4079
|
-
|
4361
|
+
selected_features_meta.append(feature_meta)
|
4080
4362
|
|
4081
4363
|
# Show and update shap values for client features only if select_features is True
|
4082
4364
|
if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
|
@@ -4089,19 +4371,24 @@ if response.status_code == 200:
|
|
4089
4371
|
updating_shap = 0.0
|
4090
4372
|
feature_meta.shap_value = updating_shap
|
4091
4373
|
|
4092
|
-
|
4374
|
+
selected_features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
4093
4375
|
|
4094
|
-
for feature_meta in
|
4376
|
+
for feature_meta in selected_features_meta:
|
4095
4377
|
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4096
|
-
is_client_feature = original_name in
|
4378
|
+
is_client_feature = original_name in clients_features_df.columns
|
4097
4379
|
|
4098
4380
|
if not is_client_feature:
|
4099
4381
|
self.external_source_feature_names.append(original_name)
|
4100
4382
|
|
4383
|
+
if self.psi_values is not None:
|
4384
|
+
if original_name in self.psi_values:
|
4385
|
+
feature_meta.psi_value = self.psi_values[original_name]
|
4386
|
+
else:
|
4387
|
+
continue
|
4388
|
+
|
4101
4389
|
# TODO make a decision about selected features based on special flag from mlb
|
4390
|
+
|
4102
4391
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
4103
|
-
if is_client_feature and self.fit_select_features:
|
4104
|
-
self.zero_shap_client_features.append(original_name)
|
4105
4392
|
continue
|
4106
4393
|
|
4107
4394
|
# Use only important features
|
@@ -4122,14 +4409,19 @@ if response.status_code == 200:
|
|
4122
4409
|
self.feature_names_.append(feature_meta.name)
|
4123
4410
|
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
4124
4411
|
|
4125
|
-
df_for_sample = features_df if feature_meta.name in features_df.columns else
|
4412
|
+
df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
|
4126
4413
|
feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
|
4127
4414
|
features_info.append(feature_info.to_row(self.bundle))
|
4128
4415
|
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
4129
4416
|
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
4130
4417
|
|
4418
|
+
if update_selected_features:
|
4419
|
+
self._search_task.update_selected_features(trace_id, self.feature_names_)
|
4420
|
+
|
4131
4421
|
if len(features_info) > 0:
|
4132
4422
|
self.features_info = pd.DataFrame(features_info)
|
4423
|
+
if self.features_info[self.bundle.get("features_info_psi")].isna().all():
|
4424
|
+
self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
|
4133
4425
|
self._features_info_without_links = pd.DataFrame(features_info_without_links)
|
4134
4426
|
self._internal_features_info = pd.DataFrame(internal_features_info)
|
4135
4427
|
if not silent:
|
@@ -4253,32 +4545,10 @@ if response.status_code == 200:
|
|
4253
4545
|
)
|
4254
4546
|
)
|
4255
4547
|
|
4256
|
-
def __filtered_importance_names(
|
4257
|
-
self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
|
4258
|
-
) -> List[str]:
|
4259
|
-
# get features importance from server
|
4260
|
-
filtered_importances = self.__get_features_importance_from_server(trace_id, df)
|
4261
|
-
|
4262
|
-
if len(filtered_importances) == 0:
|
4263
|
-
return []
|
4264
|
-
|
4265
|
-
if importance_threshold is not None:
|
4266
|
-
filtered_importances = [
|
4267
|
-
(name, importance)
|
4268
|
-
for name, importance in filtered_importances.items()
|
4269
|
-
if importance > importance_threshold
|
4270
|
-
]
|
4271
|
-
if max_features is not None:
|
4272
|
-
filtered_importances = list(filtered_importances)[:max_features]
|
4273
|
-
if len(filtered_importances) == 0:
|
4274
|
-
return []
|
4275
|
-
filtered_importance_names, _ = zip(*filtered_importances)
|
4276
|
-
return list(filtered_importance_names)
|
4277
|
-
|
4278
4548
|
def __prepare_search_keys(
|
4279
4549
|
self,
|
4280
4550
|
x: pd.DataFrame,
|
4281
|
-
search_keys:
|
4551
|
+
search_keys: dict[str, SearchKey],
|
4282
4552
|
is_demo_dataset: bool,
|
4283
4553
|
is_transform=False,
|
4284
4554
|
silent_mode=False,
|
@@ -4391,8 +4661,6 @@ if response.status_code == 200:
|
|
4391
4661
|
self,
|
4392
4662
|
scoring: Union[Callable, str, None],
|
4393
4663
|
estimator: Optional[Any],
|
4394
|
-
importance_threshold: Optional[float],
|
4395
|
-
max_features: Optional[int],
|
4396
4664
|
remove_outliers_calc_metrics: Optional[bool],
|
4397
4665
|
trace_id: str,
|
4398
4666
|
progress_bar: Optional[ProgressBar] = None,
|
@@ -4401,8 +4669,6 @@ if response.status_code == 200:
         self.metrics = self.calculate_metrics(
             scoring=scoring,
             estimator=estimator,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
             internal_call=True,
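
Note: after this pair of hunks, `calculate_metrics` no longer accepts `importance_threshold` or `max_features`; code migrating to 1.2.114 should simply drop those keyword arguments. A sketch of the adjusted call, assuming a fitted enricher (the `scoring` value is hypothetical):

    from upgini import FeaturesEnricher


    def metrics_after_upgrade(enricher: FeaturesEnricher):
        # The two removed keyword arguments are dropped; the rest of the
        # call shape matches the surrounding hunk.
        return enricher.calculate_metrics(scoring="roc_auc", estimator=None)
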
@@ -4413,22 +4679,15 @@ if response.status_code == 200:
             msg = self.bundle.get("quality_metrics_header")
             display_html_dataframe(self.metrics, self.metrics, msg)
 
-    def __show_selected_features(self
-        search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
-        if self.fit_columns_renaming:
-            search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
-        msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
-
+    def __show_selected_features(self):
         try:
             _ = get_ipython()  # type: ignore
 
-            print(Format.GREEN + Format.BOLD + msg + Format.END)
-            self.logger.info(msg)
             if len(self.feature_names_) > 0:
                 self.features_info_display_handle = display_html_dataframe(
                     self.features_info,
                     self._features_info_without_links,
-                    self.bundle.get("relevant_features_header"),
+                    self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
                     display_id=f"features_info_{uuid.uuid4()}",
                 )
 
@@ -4443,7 +4702,6 @@ if response.status_code == 200:
                 msg = self.bundle.get("features_info_zero_important_features")
                 self.__log_warning(msg, show_support_link=True)
         except (ImportError, NameError):
-            print(msg)
             print(self._internal_features_info)
 
     def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
@@ -4462,40 +4720,14 @@ if response.status_code == 200:
         except Exception:
             pass
 
-    def __validate_importance_threshold(self, importance_threshold: Optional[float]) -> float:
-        try:
-            return float(importance_threshold) if importance_threshold is not None else 0.0
-        except ValueError:
-            self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
-            raise ValidationError(self.bundle.get("invalid_importance_threshold"))
-
-    def __validate_max_features(self, max_features: Optional[int]) -> int:
-        try:
-            return int(max_features) if max_features is not None else 400
-        except ValueError:
-            self.logger.exception(f"Invalid max_features provided: {max_features}")
-            raise ValidationError(self.bundle.get("invalid_max_features"))
-
-    def __filtered_enriched_features(
-        self,
-        importance_threshold: Optional[float],
-        max_features: Optional[int],
-        trace_id: str,
-        df: pd.DataFrame,
-    ) -> List[str]:
-        importance_threshold = self.__validate_importance_threshold(importance_threshold)
-        max_features = self.__validate_max_features(max_features)
-
-        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
-
     def __detect_missing_search_keys(
         self,
         df: pd.DataFrame,
-        search_keys:
+        search_keys: dict[str, SearchKey],
         is_demo_dataset: bool,
         silent_mode=False,
         is_transform=False,
-    ) ->
+    ) -> dict[str, SearchKey]:
         sample = df.head(100)
 
         def check_need_detect(search_key: SearchKey):
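
Note: the tightened annotations in this hunk (and in `__prepare_search_keys` above) use the built-in generic `dict[str, SearchKey]` rather than `typing.Dict`, which assumes Python 3.9+ when the annotations are evaluated. A minimal sketch of constructing such a mapping (the column name is hypothetical):

    from upgini.metadata import SearchKey

    # PEP 585 builtin generics: dict[...] as an annotation needs Python >= 3.9
    search_keys: dict[str, SearchKey] = {"registration_date": SearchKey.DATE}
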
@@ -4641,7 +4873,7 @@ if response.status_code == 200:
         trace_id: str,
         X: Union[pd.DataFrame, pd.Series],
         y: Union[pd.DataFrame, pd.Series, None] = None,
-        eval_set: Union[
+        eval_set: Union[tuple, None] = None,
     ):
         def dump_task(X_, y_, eval_set_):
             with MDC(trace_id=trace_id):
@@ -4651,7 +4883,7 @@ if response.status_code == 200:
 
                 with tempfile.TemporaryDirectory() as tmp_dir:
                     X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
-                    x_digest_sha256 =
+                    x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
                     if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
                         self.logger.info(
                             f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
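
Note: `file_hash` comes from the new `upgini/utils/hash_utils.py` module added in this release and replaces the inline digest computation here and in the three hunks below. The hunk does not show its body; a plausible minimal sketch, assuming a streaming SHA-256 over the file contents:

    import hashlib
    from pathlib import Path


    def file_hash(path: str, algorithm: str = "sha256", chunk_size: int = 1 << 20) -> str:
        # Hash the file in fixed-size chunks so large parquet dumps never
        # have to be read into memory at once. Sketch only; the shipped
        # helper lives in upgini/utils/hash_utils.py.
        digest = hashlib.new(algorithm)
        with Path(path).open("rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()
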
@@ -4665,7 +4897,7 @@ if response.status_code == 200:
                     if isinstance(y_, pd.Series):
                         y_ = y_.to_frame()
                     y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
-                    y_digest_sha256 =
+                    y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
                     if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
                         self.logger.info(
                             f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
@@ -4680,9 +4912,7 @@ if response.status_code == 200:
                         if isinstance(eval_x_, pd.Series):
                             eval_x_ = eval_x_.to_frame()
                         eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
-                        eval_x_digest_sha256 =
-                            f"{tmp_dir}/eval_x_{idx}.parquet"
-                        )
+                        eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
                         if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
                             self.logger.info(
                                 f"File eval_x_{idx}.parquet was already uploaded with"
@@ -4699,9 +4929,7 @@ if response.status_code == 200:
                         if isinstance(eval_y_, pd.Series):
                             eval_y_ = eval_y_.to_frame()
                         eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
-                        eval_y_digest_sha256 =
-                            f"{tmp_dir}/eval_y_{idx}.parquet"
-                        )
+                        eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
                         if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
                             self.logger.info(
                                 f"File eval_y_{idx}.parquet was already uploaded"
@@ -4736,28 +4964,10 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
         raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
 
 
-def drop_duplicates(df: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
+def drop_duplicates(df: Union[pd.DataFrame, np.ndarray, Any]) -> pd.DataFrame:
     if isinstance(df, pd.DataFrame):
         return df.drop_duplicates()
     elif isinstance(df, np.ndarray):
         return pd.DataFrame(df).drop_duplicates()
     else:
         return df
-
-
-def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[Tuple]] = None) -> str:
-    hashed_objects = []
-    try:
-        hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
-        if y is not None:
-            hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
-        if eval_set is not None:
-            if isinstance(eval_set, tuple):
-                eval_set = [eval_set]
-            for eval_X, eval_y in eval_set:
-                hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
-                hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
-        common_hash = hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
-        return common_hash
-    except Exception:
-        return ""
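
Note: `hash_input` is relocated rather than deleted: this release re-imports it at the top of the file from the new `upgini.utils.hash_utils` module, so only its home changes. A short usage sketch, assuming the relocated helper keeps the signature removed here:

    import pandas as pd

    from upgini.utils.hash_utils import hash_input

    X = pd.DataFrame({"f": [1, 2, 3]})
    y = pd.Series([0, 1, 0])

    # Same call shape as the removed local function: X, optional y,
    # optional eval_set as a list of (X, y) tuples.
    digest = hash_input(X, y, eval_set=[(X, y)])
    print(digest)  # sha256 hex digest, or "" if hashing fails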