upgini 1.2.114a4__py3-none-any.whl → 1.2.115a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +8 -72
- upgini/features_enricher.py +401 -578
- upgini/http.py +13 -35
- upgini/metadata.py +0 -10
- upgini/resource_bundle/strings.properties +1 -4
- upgini/search_task.py +6 -0
- upgini/utils/config.py +43 -0
- upgini/utils/display_utils.py +1 -1
- upgini/utils/hash_utils.py +23 -1
- upgini/utils/psi.py +6 -3
- upgini/utils/sample_utils.py +16 -41
- upgini/utils/target_utils.py +48 -2
- {upgini-1.2.114a4.dist-info → upgini-1.2.115a1.dist-info}/METADATA +32 -16
- {upgini-1.2.114a4.dist-info → upgini-1.2.115a1.dist-info}/RECORD +17 -16
- {upgini-1.2.114a4.dist-info → upgini-1.2.115a1.dist-info}/WHEEL +0 -0
- {upgini-1.2.114a4.dist-info → upgini-1.2.115a1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
import dataclasses
|
2
2
|
import datetime
|
3
3
|
import gc
|
4
|
-
import hashlib
|
5
|
-
import itertools
|
6
4
|
import json
|
7
5
|
import logging
|
8
6
|
import os
|
@@ -14,7 +12,7 @@ from collections import Counter
|
|
14
12
|
from copy import deepcopy
|
15
13
|
from dataclasses import dataclass
|
16
14
|
from threading import Thread
|
17
|
-
from typing import Any, Callable
|
15
|
+
from typing import Any, Callable
|
18
16
|
|
19
17
|
import numpy as np
|
20
18
|
import pandas as pd
|
@@ -101,7 +99,7 @@ from upgini.utils.email_utils import (
|
|
101
99
|
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
102
100
|
from upgini.utils.features_validator import FeaturesValidator
|
103
101
|
from upgini.utils.format import Format
|
104
|
-
from upgini.utils.hash_utils import file_hash
|
102
|
+
from upgini.utils.hash_utils import file_hash, hash_input
|
105
103
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
106
104
|
from upgini.utils.phone_utils import PhoneSearchKeyDetector
|
107
105
|
from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
|
@@ -113,10 +111,11 @@ except Exception:
|
|
113
111
|
CustomFallbackProgressBar as ProgressBar,
|
114
112
|
)
|
115
113
|
|
114
|
+
from upgini.utils.config import SampleConfig
|
116
115
|
from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
|
117
|
-
from upgini.utils.sample_utils import SampleColumns,
|
116
|
+
from upgini.utils.sample_utils import SampleColumns, _num_samples, sample
|
118
117
|
from upgini.utils.sort import sort_columns
|
119
|
-
from upgini.utils.target_utils import calculate_psi, define_task
|
118
|
+
from upgini.utils.target_utils import calculate_psi, define_task, is_imbalanced
|
120
119
|
from upgini.utils.warning_counter import WarningCounter
|
121
120
|
from upgini.version_validator import validate_version
|
122
121
|
|
@@ -132,7 +131,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
132
131
|
Parameters
|
133
132
|
----------
|
134
133
|
search_keys: dict of str->SearchKey or int->SearchKey
|
135
|
-
|
134
|
+
dictionary with column names or indices mapping to key types.
|
136
135
|
Each of these columns will be used as a search key to find features.
|
137
136
|
|
138
137
|
country_code: str, optional (default=None)
|
@@ -164,7 +163,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
164
163
|
Custom loss function to use for feature selection and metrics calculation.
|
165
164
|
|
166
165
|
shared_datasets: list of str, optional (default=None)
|
167
|
-
|
166
|
+
list of private shared dataset ids for custom search
|
168
167
|
"""
|
169
168
|
|
170
169
|
TARGET_NAME = "target"
|
@@ -208,34 +207,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
208
207
|
|
209
208
|
def __init__(
|
210
209
|
self,
|
211
|
-
search_keys:
|
212
|
-
country_code:
|
213
|
-
model_task_type:
|
214
|
-
api_key:
|
215
|
-
endpoint:
|
216
|
-
search_id:
|
217
|
-
shared_datasets:
|
218
|
-
runtime_parameters:
|
219
|
-
date_format:
|
210
|
+
search_keys: dict[str, SearchKey] | None = None,
|
211
|
+
country_code: str | None = None,
|
212
|
+
model_task_type: ModelTaskType | str | None = None,
|
213
|
+
api_key: str | None = None,
|
214
|
+
endpoint: str | None = None,
|
215
|
+
search_id: str | None = None,
|
216
|
+
shared_datasets: list[str] | None = None,
|
217
|
+
runtime_parameters: RuntimeParameters | None = None,
|
218
|
+
date_format: str | None = None,
|
220
219
|
random_state: int = 42,
|
221
|
-
cv:
|
222
|
-
loss:
|
220
|
+
cv: CVType | None = None,
|
221
|
+
loss: str | None = None,
|
223
222
|
autodetect_search_keys: bool = True,
|
224
|
-
generate_features:
|
225
|
-
columns_for_online_api:
|
226
|
-
round_embeddings:
|
223
|
+
generate_features: list[str] | None = None,
|
224
|
+
columns_for_online_api: list[str] | None = None,
|
225
|
+
round_embeddings: int | None = None,
|
227
226
|
logs_enabled: bool = True,
|
228
227
|
raise_validation_error: bool = True,
|
229
|
-
exclude_columns:
|
230
|
-
baseline_score_column:
|
231
|
-
client_ip:
|
232
|
-
client_visitorid:
|
233
|
-
custom_bundle_config:
|
228
|
+
exclude_columns: list[str] | None = None,
|
229
|
+
baseline_score_column: Any | None = None,
|
230
|
+
client_ip: str | None = None,
|
231
|
+
client_visitorid: str | None = None,
|
232
|
+
custom_bundle_config: str | None = None,
|
234
233
|
add_date_if_missing: bool = True,
|
235
234
|
disable_force_downsampling: bool = False,
|
236
|
-
id_columns:
|
235
|
+
id_columns: list[str] | None = None,
|
237
236
|
generate_search_key_features: bool = True,
|
238
|
-
sample_config:
|
237
|
+
sample_config: SampleConfig | None = None,
|
239
238
|
print_trace_id: bool = False,
|
240
239
|
**kwargs,
|
241
240
|
):
|
@@ -259,21 +258,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
259
258
|
self.logger.warning(msg)
|
260
259
|
print(msg)
|
261
260
|
|
262
|
-
self.passed_features:
|
263
|
-
self.df_with_original_index:
|
264
|
-
self.fit_columns_renaming:
|
261
|
+
self.passed_features: list[str] = []
|
262
|
+
self.df_with_original_index: pd.DataFrame | None = None
|
263
|
+
self.fit_columns_renaming: dict[str, str] | None = None
|
265
264
|
self.country_added = False
|
266
|
-
self.fit_generated_features:
|
267
|
-
self.fit_dropped_features:
|
265
|
+
self.fit_generated_features: list[str] = []
|
266
|
+
self.fit_dropped_features: set[str] = set()
|
268
267
|
self.fit_search_keys = search_keys
|
269
268
|
self.warning_counter = WarningCounter()
|
270
|
-
self.X:
|
271
|
-
self.y:
|
272
|
-
self.eval_set:
|
273
|
-
self.autodetected_search_keys:
|
269
|
+
self.X: pd.DataFrame | None = None
|
270
|
+
self.y: pd.Series | None = None
|
271
|
+
self.eval_set: list[tuple] | None = None
|
272
|
+
self.autodetected_search_keys: dict[str, SearchKey] = {}
|
274
273
|
self.imbalanced = False
|
275
274
|
self.fit_select_features = True
|
276
|
-
self.__cached_sampled_datasets:
|
275
|
+
self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
|
277
276
|
dict()
|
278
277
|
)
|
279
278
|
|
@@ -289,19 +288,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
289
288
|
self.model_task_type = ModelTaskType.parse(model_task_type)
|
290
289
|
self.model_task_type = model_task_type
|
291
290
|
self.endpoint = endpoint
|
292
|
-
self._search_task:
|
291
|
+
self._search_task: SearchTask | None = None
|
293
292
|
self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
|
294
293
|
self._features_info_without_links: pd.DataFrame = self.EMPTY_FEATURES_INFO
|
295
294
|
self._internal_features_info: pd.DataFrame = self.EMPTY_INTERNAL_FEATURES_INFO
|
296
295
|
self.relevant_data_sources: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
297
296
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
298
|
-
self.metrics:
|
297
|
+
self.metrics: pd.DataFrame | None = None
|
299
298
|
self.feature_names_ = []
|
300
299
|
self.external_source_feature_names = []
|
301
|
-
self.zero_shap_client_features = []
|
302
|
-
self.unstable_client_features = []
|
303
300
|
self.feature_importances_ = []
|
304
|
-
self.psi_values:
|
301
|
+
self.psi_values: dict[str, float] | None = None
|
305
302
|
self.search_id = search_id
|
306
303
|
self.disable_force_downsampling = disable_force_downsampling
|
307
304
|
self.print_trace_id = print_trace_id
|
@@ -321,7 +318,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
321
318
|
x_columns = [c.name for c in file_metadata.columns]
|
322
319
|
self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
|
323
320
|
df = pd.DataFrame(columns=x_columns)
|
324
|
-
self.__prepare_feature_importances(trace_id, df, silent=True)
|
321
|
+
self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
|
322
|
+
self.__show_selected_features()
|
325
323
|
# TODO validate search_keys with search_keys from file_metadata
|
326
324
|
print(self.bundle.get("search_by_task_id_finish"))
|
327
325
|
self.logger.debug(f"Successfully initialized with search_id: {search_id}")
|
@@ -377,7 +375,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
377
375
|
self.autofe_features_display_handle = None
|
378
376
|
self.report_button_handle = None
|
379
377
|
|
380
|
-
def _get_sample_config(self, sample_config:
|
378
|
+
def _get_sample_config(self, sample_config: SampleConfig | None = None):
|
381
379
|
sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
|
382
380
|
|
383
381
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
@@ -432,22 +430,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
432
430
|
|
433
431
|
def fit(
|
434
432
|
self,
|
435
|
-
X:
|
436
|
-
y:
|
437
|
-
eval_set:
|
433
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
434
|
+
y: pd.Series | np.ndarray | list,
|
435
|
+
eval_set: list[tuple] | tuple | None = None,
|
438
436
|
*args,
|
439
|
-
exclude_features_sources:
|
440
|
-
calculate_metrics:
|
441
|
-
estimator:
|
442
|
-
scoring:
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
447
|
-
search_id_callback: Optional[Callable[[str], Any]] = None,
|
437
|
+
exclude_features_sources: list[str] | None = None,
|
438
|
+
calculate_metrics: bool | None = None,
|
439
|
+
estimator: Any | None = None,
|
440
|
+
scoring: Callable | str | None = None,
|
441
|
+
remove_outliers_calc_metrics: bool | None = None,
|
442
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
443
|
+
search_id_callback: Callable[[str], Any] | None = None,
|
448
444
|
select_features: bool = True,
|
449
|
-
auto_fe_parameters:
|
450
|
-
stability_threshold: float = 0.
|
445
|
+
auto_fe_parameters: AutoFEParameters | None = None,
|
446
|
+
stability_threshold: float = 0.2,
|
447
|
+
stability_agg_func: str = "max",
|
451
448
|
**kwargs,
|
452
449
|
):
|
453
450
|
"""Fit to data.
|
@@ -462,14 +459,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
462
459
|
y: array-like of shape (n_samples,)
|
463
460
|
Target values.
|
464
461
|
|
465
|
-
eval_set:
|
466
|
-
|
467
|
-
|
468
|
-
importance_threshold: float, optional (default=None)
|
469
|
-
Minimum SHAP value to select a feature. Default value is 0.0.
|
470
|
-
|
471
|
-
max_features: int, optional (default=None)
|
472
|
-
Maximum number of most important features to select. If None, the number is unlimited.
|
462
|
+
eval_set: list[tuple], optional (default=None)
|
463
|
+
list of pairs (X, y) for validation.
|
473
464
|
|
474
465
|
calculate_metrics: bool, optional (default=None)
|
475
466
|
Whether to calculate and show metrics.
|
@@ -487,6 +478,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
487
478
|
select_features: bool, optional (default=True)
|
488
479
|
If True, return only selected features both from input and data sources.
|
489
480
|
Otherwise, return all features from input and only selected features from data sources.
|
481
|
+
|
482
|
+
stability_threshold: float, optional (default=0.2)
|
483
|
+
Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
|
484
|
+
then feature will be dropped.
|
485
|
+
|
486
|
+
stability_agg_func: str, optional (default="max")
|
487
|
+
Function to aggregate stability values. Can be "max", "min", "mean".
|
490
488
|
"""
|
491
489
|
trace_id = str(uuid.uuid4())
|
492
490
|
if self.print_trace_id:
|
@@ -536,9 +534,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
536
534
|
calculate_metrics=calculate_metrics,
|
537
535
|
estimator=estimator,
|
538
536
|
scoring=scoring,
|
539
|
-
importance_threshold=importance_threshold,
|
540
537
|
stability_threshold=stability_threshold,
|
541
|
-
|
538
|
+
stability_agg_func=stability_agg_func,
|
542
539
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
543
540
|
auto_fe_parameters=auto_fe_parameters,
|
544
541
|
progress_callback=progress_callback,
|
@@ -582,28 +579,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
582
579
|
|
583
580
|
def fit_transform(
|
584
581
|
self,
|
585
|
-
X:
|
586
|
-
y:
|
587
|
-
eval_set:
|
582
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
583
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list,
|
584
|
+
eval_set: list[tuple] | tuple | None = None,
|
588
585
|
*args,
|
589
|
-
exclude_features_sources:
|
590
|
-
keep_input: bool =
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
remove_outliers_calc_metrics: Optional[bool] = None,
|
597
|
-
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
586
|
+
exclude_features_sources: list[str] | None | None = None,
|
587
|
+
keep_input: bool | None = None,
|
588
|
+
calculate_metrics: bool | None = None,
|
589
|
+
scoring: Callable | str | None = None,
|
590
|
+
estimator: Any | None = None,
|
591
|
+
remove_outliers_calc_metrics: bool | None = None,
|
592
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
598
593
|
select_features: bool = True,
|
599
|
-
auto_fe_parameters:
|
600
|
-
stability_threshold: float = 0.
|
594
|
+
auto_fe_parameters: AutoFEParameters | None = None,
|
595
|
+
stability_threshold: float = 0.2,
|
596
|
+
stability_agg_func: str = "max",
|
601
597
|
**kwargs,
|
602
598
|
) -> pd.DataFrame:
|
603
599
|
"""Fit to data, then transform it.
|
604
600
|
|
605
601
|
Fits transformer to `X` and `y` and returns a transformed version of `X`.
|
606
|
-
If keep_input is True, then all input columns are copied to the output dataframe.
|
607
602
|
|
608
603
|
Parameters
|
609
604
|
----------
|
@@ -613,20 +608,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
613
608
|
y: array-like of shape (n_samples,)
|
614
609
|
Target values.
|
615
610
|
|
616
|
-
eval_set:
|
617
|
-
|
611
|
+
eval_set: list[tuple], optional (default=None)
|
612
|
+
list of pairs (X, y) for validation.
|
618
613
|
|
619
614
|
keep_input: bool, optional (default=True)
|
615
|
+
keep_input: bool, optional (default=None)
|
620
616
|
If True, copy original input columns to the output dataframe.
|
621
|
-
|
622
|
-
|
623
|
-
Minimum SHAP value to select a feature. Default value is 0.0.
|
624
|
-
|
625
|
-
max_features: int, optional (default=None)
|
626
|
-
Maximum number of most important features to select. If None, the number is unlimited.
|
627
|
-
|
628
|
-
calculate_metrics: bool, optional (default=None)
|
629
|
-
Whether to calculate and show metrics.
|
617
|
+
If False, then only enriched columns are returned.
|
618
|
+
If None, then all search keys, ID columns, selected client features and enriched columns will be returned.
|
630
619
|
|
631
620
|
estimator: sklearn-compatible estimator, optional (default=None)
|
632
621
|
Custom estimator for metrics calculation.
|
@@ -642,10 +631,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
642
631
|
If True, return only selected features both from input and data sources.
|
643
632
|
Otherwise, return all features from input and only selected features from data sources.
|
644
633
|
|
645
|
-
stability_threshold: float, optional (default=0.
|
634
|
+
stability_threshold: float, optional (default=0.2)
|
646
635
|
Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
|
647
636
|
then feature will be dropped.
|
648
637
|
|
638
|
+
stability_agg_func: str, optional (default="max")
|
639
|
+
Function to aggregate stability values. Can be "max", "min", "mean".
|
640
|
+
|
649
641
|
Returns
|
650
642
|
-------
|
651
643
|
X_new: pandas.DataFrame of shape (n_samples, n_features_new)
|
@@ -701,9 +693,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
701
693
|
calculate_metrics=calculate_metrics,
|
702
694
|
scoring=scoring,
|
703
695
|
estimator=estimator,
|
704
|
-
importance_threshold=importance_threshold,
|
705
696
|
stability_threshold=stability_threshold,
|
706
|
-
|
697
|
+
stability_agg_func=stability_agg_func,
|
707
698
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
708
699
|
auto_fe_parameters=auto_fe_parameters,
|
709
700
|
progress_callback=progress_callback,
|
@@ -746,8 +737,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
746
737
|
X,
|
747
738
|
exclude_features_sources=exclude_features_sources,
|
748
739
|
keep_input=keep_input,
|
749
|
-
importance_threshold=importance_threshold,
|
750
|
-
max_features=max_features,
|
751
740
|
trace_id=trace_id,
|
752
741
|
silent_mode=True,
|
753
742
|
progress_bar=progress_bar,
|
@@ -760,36 +749,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
760
749
|
self,
|
761
750
|
X: pd.DataFrame,
|
762
751
|
*args,
|
763
|
-
y:
|
764
|
-
exclude_features_sources:
|
765
|
-
keep_input: bool =
|
766
|
-
|
767
|
-
max_features: Optional[int] = None,
|
768
|
-
trace_id: Optional[str] = None,
|
752
|
+
y: pd.Series | None = None,
|
753
|
+
exclude_features_sources: list[str] | None = None,
|
754
|
+
keep_input: bool | None = None,
|
755
|
+
trace_id: str | None = None,
|
769
756
|
metrics_calculation: bool = False,
|
770
757
|
silent_mode=False,
|
771
|
-
progress_bar:
|
772
|
-
progress_callback:
|
758
|
+
progress_bar: ProgressBar | None = None,
|
759
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
773
760
|
**kwargs,
|
774
761
|
) -> pd.DataFrame:
|
775
762
|
"""Transform `X`.
|
776
763
|
|
777
764
|
Returns a transformed version of `X`.
|
778
|
-
If keep_input is True, then all input columns are copied to the output dataframe.
|
779
765
|
|
780
766
|
Parameters
|
781
767
|
----------
|
782
768
|
X: pandas.DataFrame of shape (n_samples, n_features)
|
783
769
|
Input samples.
|
784
770
|
|
785
|
-
keep_input: bool, optional (default=
|
771
|
+
keep_input: bool, optional (default=None)
|
786
772
|
If True, copy original input columns to the output dataframe.
|
787
|
-
|
788
|
-
|
789
|
-
Minimum SHAP value to select a feature. Default value is 0.0.
|
790
|
-
|
791
|
-
max_features: int, optional (default=None)
|
792
|
-
Maximum number of most important features to select. If None, the number is unlimited.
|
773
|
+
If False, then only enriched columns are returned.
|
774
|
+
If None, then all search keys, ID columns, selected client features and enriched columns will be returned.
|
793
775
|
|
794
776
|
Returns
|
795
777
|
-------
|
@@ -827,11 +809,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
827
809
|
X,
|
828
810
|
y=y,
|
829
811
|
exclude_features_sources=exclude_features_sources,
|
830
|
-
importance_threshold=importance_threshold,
|
831
|
-
max_features=max_features,
|
832
812
|
metrics_calculation=metrics_calculation,
|
833
813
|
silent_mode=silent_mode,
|
834
814
|
progress_bar=progress_bar,
|
815
|
+
keep_input=keep_input,
|
835
816
|
)
|
836
817
|
self.logger.info("Transform finished successfully")
|
837
818
|
search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
|
@@ -873,32 +854,26 @@ class FeaturesEnricher(TransformerMixin):
|
|
873
854
|
raise e
|
874
855
|
finally:
|
875
856
|
self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
|
876
|
-
|
877
|
-
|
878
|
-
if keep_input:
|
879
|
-
return result
|
880
|
-
else:
|
881
|
-
return result.drop(columns=X.columns, errors="ignore")
|
857
|
+
|
858
|
+
return result
|
882
859
|
|
883
860
|
def calculate_metrics(
|
884
861
|
self,
|
885
|
-
X:
|
886
|
-
y:
|
887
|
-
eval_set:
|
862
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
863
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
864
|
+
eval_set: list[tuple] | tuple | None = None,
|
888
865
|
*args,
|
889
|
-
scoring:
|
890
|
-
cv:
|
866
|
+
scoring: Callable | str | None = None,
|
867
|
+
cv: BaseCrossValidator | CVType | str | None = None,
|
891
868
|
estimator=None,
|
892
|
-
exclude_features_sources:
|
893
|
-
|
894
|
-
|
895
|
-
remove_outliers_calc_metrics: Optional[bool] = None,
|
896
|
-
trace_id: Optional[str] = None,
|
869
|
+
exclude_features_sources: list[str] | None = None,
|
870
|
+
remove_outliers_calc_metrics: bool | None = None,
|
871
|
+
trace_id: str | None = None,
|
897
872
|
internal_call: bool = False,
|
898
|
-
progress_bar:
|
899
|
-
progress_callback:
|
873
|
+
progress_bar: ProgressBar | None = None,
|
874
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
900
875
|
**kwargs,
|
901
|
-
) ->
|
876
|
+
) -> pd.DataFrame | None:
|
902
877
|
"""Calculate metrics
|
903
878
|
|
904
879
|
Parameters
|
@@ -909,8 +884,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
909
884
|
y: array-like of shape (n_samples,), optional (default=None)
|
910
885
|
Target values. If X not passed then y from fit will be used
|
911
886
|
|
912
|
-
eval_set:
|
913
|
-
|
887
|
+
eval_set: list[tuple], optional (default=None)
|
888
|
+
list of pairs (X, y) for validation. If X not passed then eval_set from fit will be used
|
914
889
|
|
915
890
|
scoring: string or callable, optional (default=None)
|
916
891
|
A string or a scorer callable object / function with signature scorer(estimator, X, y).
|
@@ -922,12 +897,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
922
897
|
estimator: sklearn-compatible estimator, optional (default=None)
|
923
898
|
Custom estimator for metrics calculation. If not passed then CatBoost will be used.
|
924
899
|
|
925
|
-
importance_threshold: float, optional (default=None)
|
926
|
-
Minimum SHAP value to select a feature. Default value is 0.0.
|
927
|
-
|
928
|
-
max_features: int, optional (default=None)
|
929
|
-
Maximum number of most important features to select. If None, the number is unlimited.
|
930
|
-
|
931
900
|
remove_outliers_calc_metrics, optional (default=True)
|
932
901
|
If True then rows with target outliers will be dropped on metrics calculation
|
933
902
|
|
@@ -990,8 +959,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
990
959
|
validated_eval_set,
|
991
960
|
exclude_features_sources=exclude_features_sources,
|
992
961
|
cv=cv if cv is not None else self.cv,
|
993
|
-
importance_threshold=importance_threshold,
|
994
|
-
max_features=max_features,
|
995
962
|
scoring=scoring,
|
996
963
|
estimator=estimator,
|
997
964
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
@@ -1032,20 +999,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
1032
999
|
search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
|
1033
1000
|
self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
|
1034
1001
|
|
1035
|
-
prepared_data = self.
|
1002
|
+
prepared_data = self._get_cached_enriched_data(
|
1036
1003
|
trace_id=trace_id,
|
1037
1004
|
X=X,
|
1038
1005
|
y=y,
|
1039
1006
|
eval_set=eval_set,
|
1040
1007
|
exclude_features_sources=exclude_features_sources,
|
1041
|
-
importance_threshold=importance_threshold,
|
1042
|
-
max_features=max_features,
|
1043
1008
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
1044
1009
|
cv_override=cv,
|
1045
1010
|
search_keys_for_metrics=search_keys_for_metrics,
|
1046
1011
|
progress_bar=progress_bar,
|
1047
1012
|
progress_callback=progress_callback,
|
1048
1013
|
client_cat_features=client_cat_features,
|
1014
|
+
is_for_metrics=True,
|
1049
1015
|
)
|
1050
1016
|
if prepared_data is None:
|
1051
1017
|
return None
|
@@ -1345,17 +1311,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
1345
1311
|
def _select_features_by_psi(
|
1346
1312
|
self,
|
1347
1313
|
trace_id: str,
|
1348
|
-
X:
|
1349
|
-
y:
|
1350
|
-
eval_set:
|
1314
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
1315
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list,
|
1316
|
+
eval_set: list[tuple] | tuple | None,
|
1351
1317
|
stability_threshold: float,
|
1352
|
-
|
1318
|
+
stability_agg_func: Callable,
|
1319
|
+
cv: BaseCrossValidator | CVType | str | None = None,
|
1353
1320
|
estimator=None,
|
1354
|
-
exclude_features_sources:
|
1355
|
-
importance_threshold: Optional[float] = None,
|
1356
|
-
max_features: Optional[int] = None,
|
1321
|
+
exclude_features_sources: list[str] | None = None,
|
1357
1322
|
progress_bar: bool = True,
|
1358
|
-
progress_callback:
|
1323
|
+
progress_callback: Callable | None = None,
|
1359
1324
|
):
|
1360
1325
|
search_keys = self.search_keys.copy()
|
1361
1326
|
validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
|
@@ -1392,14 +1357,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
1392
1357
|
c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
|
1393
1358
|
]
|
1394
1359
|
|
1395
|
-
prepared_data = self.
|
1360
|
+
prepared_data = self._get_cached_enriched_data(
|
1396
1361
|
trace_id=trace_id,
|
1397
1362
|
X=X,
|
1398
1363
|
y=y,
|
1399
1364
|
eval_set=eval_set,
|
1400
1365
|
exclude_features_sources=exclude_features_sources,
|
1401
|
-
importance_threshold=importance_threshold,
|
1402
|
-
max_features=max_features,
|
1403
1366
|
remove_outliers_calc_metrics=False,
|
1404
1367
|
cv_override=cv,
|
1405
1368
|
search_keys_for_metrics=search_keys_for_metrics,
|
@@ -1412,15 +1375,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
1412
1375
|
|
1413
1376
|
(
|
1414
1377
|
validated_X,
|
1415
|
-
|
1378
|
+
_,
|
1416
1379
|
y_sorted,
|
1417
|
-
|
1380
|
+
_,
|
1418
1381
|
_,
|
1419
1382
|
fitting_eval_set_dict,
|
1420
1383
|
_,
|
1421
1384
|
_,
|
1422
1385
|
_,
|
1423
|
-
|
1386
|
+
_,
|
1424
1387
|
eval_set_dates,
|
1425
1388
|
) = prepared_data
|
1426
1389
|
|
@@ -1435,43 +1398,28 @@ class FeaturesEnricher(TransformerMixin):
|
|
1435
1398
|
eval_set_dates,
|
1436
1399
|
search_keys,
|
1437
1400
|
stability_threshold,
|
1401
|
+
stability_agg_func,
|
1438
1402
|
cat_features,
|
1439
1403
|
model_task_type,
|
1440
1404
|
)
|
1441
|
-
client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
|
1442
|
-
# decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
|
1443
|
-
self._update_report_psi(trace_id, client_features_df)
|
1444
1405
|
|
1445
1406
|
if unstable_features:
|
1446
|
-
msg = f"
|
1407
|
+
msg = f"{len(unstable_features)} feature(s) are unstable: {unstable_features} and will be dropped"
|
1447
1408
|
self.logger.warning(msg)
|
1448
1409
|
print(msg)
|
1449
|
-
fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
|
1450
|
-
fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
|
1451
|
-
msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
|
1452
|
-
self.logger.info(msg)
|
1453
|
-
print(msg)
|
1454
|
-
for idx, (
|
1455
|
-
eval_X,
|
1456
|
-
eval_y,
|
1457
|
-
eval_enriched_X,
|
1458
|
-
eval_enriched_y,
|
1459
|
-
) in fitting_eval_set_dict.items():
|
1460
|
-
eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
|
1461
|
-
eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
|
1462
|
-
fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
|
1463
1410
|
|
1464
1411
|
def _check_stability(
|
1465
1412
|
self,
|
1466
1413
|
X: pd.DataFrame,
|
1467
|
-
eval_set:
|
1468
|
-
enriched_eval_set:
|
1469
|
-
eval_set_dates:
|
1470
|
-
search_keys:
|
1414
|
+
eval_set: list[tuple[pd.DataFrame, pd.Series]],
|
1415
|
+
enriched_eval_set: dict,
|
1416
|
+
eval_set_dates: dict[int, pd.Series],
|
1417
|
+
search_keys: dict[str, SearchKey],
|
1471
1418
|
stability_threshold: float,
|
1472
|
-
|
1419
|
+
stability_agg_func: str | None,
|
1420
|
+
cat_features: list[str],
|
1473
1421
|
model_task_type: ModelTaskType,
|
1474
|
-
) ->
|
1422
|
+
) -> list[str]:
|
1475
1423
|
# Find latest eval set or earliest if all eval sets are before train set
|
1476
1424
|
date_column = self._get_date_column(search_keys)
|
1477
1425
|
|
@@ -1521,17 +1469,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
1521
1469
|
|
1522
1470
|
unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
|
1523
1471
|
if unstable_by_sparsity:
|
1524
|
-
self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
|
1472
|
+
self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")
|
1525
1473
|
|
1526
1474
|
psi_values = calculate_features_psi(
|
1527
|
-
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
|
1475
|
+
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
|
1528
1476
|
)
|
1529
1477
|
|
1530
1478
|
self.logger.info(f"PSI values by value: {psi_values}")
|
1531
1479
|
|
1532
1480
|
unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
|
1533
1481
|
if unstable_by_value:
|
1534
|
-
self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
|
1482
|
+
self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")
|
1535
1483
|
|
1536
1484
|
self.psi_values = {
|
1537
1485
|
feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
|
@@ -1541,7 +1489,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1541
1489
|
|
1542
1490
|
return total_unstable_features
|
1543
1491
|
|
1544
|
-
def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps:
|
1492
|
+
def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
|
1545
1493
|
renaming = self.fit_columns_renaming or {}
|
1546
1494
|
self.logger.info(f"Updating SHAP values: {new_shaps}")
|
1547
1495
|
new_shaps = {
|
@@ -1558,7 +1506,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1558
1506
|
display_html_dataframe(
|
1559
1507
|
self.features_info,
|
1560
1508
|
self._features_info_without_links,
|
1561
|
-
self.bundle.get("relevant_features_header"),
|
1509
|
+
self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
|
1562
1510
|
display_handle=self.features_info_display_handle,
|
1563
1511
|
)
|
1564
1512
|
except (ImportError, NameError):
|
@@ -1596,56 +1544,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
1596
1544
|
except (ImportError, NameError):
|
1597
1545
|
pass
|
1598
1546
|
|
1599
|
-
def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
|
1600
|
-
self.__prepare_feature_importances(trace_id, clients_features_df)
|
1601
|
-
|
1602
|
-
if self.features_info_display_handle is not None:
|
1603
|
-
try:
|
1604
|
-
_ = get_ipython() # type: ignore
|
1605
|
-
|
1606
|
-
display_html_dataframe(
|
1607
|
-
self.features_info,
|
1608
|
-
self._features_info_without_links,
|
1609
|
-
self.bundle.get("relevant_features_header"),
|
1610
|
-
display_handle=self.features_info_display_handle,
|
1611
|
-
)
|
1612
|
-
except (ImportError, NameError):
|
1613
|
-
pass
|
1614
|
-
|
1615
|
-
if self.data_sources_display_handle is not None:
|
1616
|
-
try:
|
1617
|
-
_ = get_ipython() # type: ignore
|
1618
|
-
|
1619
|
-
display_html_dataframe(
|
1620
|
-
self.relevant_data_sources,
|
1621
|
-
self._relevant_data_sources_wo_links,
|
1622
|
-
self.bundle.get("relevant_data_sources_header"),
|
1623
|
-
display_handle=self.data_sources_display_handle,
|
1624
|
-
)
|
1625
|
-
except (ImportError, NameError):
|
1626
|
-
pass
|
1627
|
-
|
1628
|
-
if self.autofe_features_display_handle is not None:
|
1629
|
-
try:
|
1630
|
-
_ = get_ipython() # type: ignore
|
1631
|
-
autofe_descriptions_df = self.get_autofe_features_description()
|
1632
|
-
if autofe_descriptions_df is not None:
|
1633
|
-
display_html_dataframe(
|
1634
|
-
df=autofe_descriptions_df,
|
1635
|
-
internal_df=autofe_descriptions_df,
|
1636
|
-
header=self.bundle.get("autofe_descriptions_header"),
|
1637
|
-
display_handle=self.autofe_features_display_handle,
|
1638
|
-
)
|
1639
|
-
except (ImportError, NameError):
|
1640
|
-
pass
|
1641
|
-
if self.report_button_handle is not None:
|
1642
|
-
try:
|
1643
|
-
_ = get_ipython() # type: ignore
|
1644
|
-
|
1645
|
-
self.__show_report_button(display_handle=self.report_button_handle)
|
1646
|
-
except (ImportError, NameError):
|
1647
|
-
pass
|
1648
|
-
|
1649
1547
|
def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
|
1650
1548
|
uneven_distribution = False
|
1651
1549
|
for eval_set in eval_set_dict.values():
|
@@ -1659,13 +1557,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
1659
1557
|
self.logger.warning(msg)
|
1660
1558
|
|
1661
1559
|
def _has_features_with_commercial_schema(
|
1662
|
-
self, commercial_schema: str, exclude_features_sources:
|
1560
|
+
self, commercial_schema: str, exclude_features_sources: list[str] | None
|
1663
1561
|
) -> bool:
|
1664
1562
|
return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
|
1665
1563
|
|
1666
1564
|
def _get_features_with_commercial_schema(
|
1667
|
-
self, commercial_schema: str, exclude_features_sources:
|
1668
|
-
) ->
|
1565
|
+
self, commercial_schema: str, exclude_features_sources: list[str] | None
|
1566
|
+
) -> list[str]:
|
1669
1567
|
if exclude_features_sources:
|
1670
1568
|
filtered_features_info = self._internal_features_info[
|
1671
1569
|
~self._internal_features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
|
@@ -1679,15 +1577,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
1679
1577
|
].values
|
1680
1578
|
)
|
1681
1579
|
|
1682
|
-
def _has_paid_features(self, exclude_features_sources:
|
1580
|
+
def _has_paid_features(self, exclude_features_sources: list[str] | None) -> bool:
|
1683
1581
|
return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
|
1684
1582
|
|
1685
1583
|
def _is_input_same_as_fit(
|
1686
1584
|
self,
|
1687
|
-
X:
|
1688
|
-
y:
|
1689
|
-
eval_set:
|
1690
|
-
) ->
|
1585
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
1586
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
1587
|
+
eval_set: list[tuple] | None = None,
|
1588
|
+
) -> tuple:
|
1691
1589
|
if X is None:
|
1692
1590
|
return True, self.X, self.y, self.eval_set
|
1693
1591
|
|
@@ -1717,9 +1615,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
1717
1615
|
def _get_cv_and_groups(
|
1718
1616
|
self,
|
1719
1617
|
X: pd.DataFrame,
|
1720
|
-
cv_override:
|
1721
|
-
search_keys:
|
1722
|
-
) ->
|
1618
|
+
cv_override: BaseCrossValidator | CVType | str | None,
|
1619
|
+
search_keys: dict[str, SearchKey],
|
1620
|
+
) -> tuple[BaseCrossValidator, np.ndarray] | None:
|
1723
1621
|
_cv = cv_override or self.cv
|
1724
1622
|
group_columns = sorted(self._get_group_columns(X, search_keys))
|
1725
1623
|
groups = None
|
@@ -1747,8 +1645,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1747
1645
|
return _cv, groups
|
1748
1646
|
|
1749
1647
|
def _get_and_validate_client_cat_features(
|
1750
|
-
self, estimator:
|
1751
|
-
) ->
|
1648
|
+
self, estimator: Any | None, X: pd.DataFrame, search_keys: dict[str, SearchKey]
|
1649
|
+
) -> tuple[list[str] | None, list[str]]:
|
1752
1650
|
cat_features = []
|
1753
1651
|
search_keys_for_metrics = []
|
1754
1652
|
if (
|
@@ -1777,41 +1675,41 @@ class FeaturesEnricher(TransformerMixin):
|
|
1777
1675
|
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
1778
1676
|
return cat_features, search_keys_for_metrics
|
1779
1677
|
|
1780
|
-
def
|
1678
|
+
def _get_cached_enriched_data(
|
1781
1679
|
self,
|
1782
1680
|
trace_id: str,
|
1783
|
-
X:
|
1784
|
-
y:
|
1785
|
-
eval_set:
|
1786
|
-
exclude_features_sources:
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1790
|
-
|
1791
|
-
|
1792
|
-
|
1793
|
-
|
1794
|
-
client_cat_features: Optional[List[str]] = None,
|
1681
|
+
X: pd.DataFrame | pd.Series | np.ndarray | None = None,
|
1682
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
|
1683
|
+
eval_set: list[tuple] | tuple | None = None,
|
1684
|
+
exclude_features_sources: list[str] | None = None,
|
1685
|
+
remove_outliers_calc_metrics: bool | None = None,
|
1686
|
+
cv_override: BaseCrossValidator | CVType | str | None = None,
|
1687
|
+
search_keys_for_metrics: list[str] | None = None,
|
1688
|
+
progress_bar: ProgressBar | None = None,
|
1689
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
1690
|
+
client_cat_features: list[str] | None = None,
|
1691
|
+
is_for_metrics: bool = False,
|
1795
1692
|
):
|
1796
1693
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
1797
1694
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
1798
1695
|
checked_eval_set = self._check_eval_set(eval_set, X)
|
1799
1696
|
validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
|
1800
1697
|
|
1801
|
-
sampled_data = self.
|
1802
|
-
trace_id,
|
1803
|
-
validated_X,
|
1804
|
-
validated_y,
|
1805
|
-
validated_eval_set,
|
1806
|
-
exclude_features_sources,
|
1807
|
-
is_input_same_as_fit,
|
1808
|
-
is_demo_dataset,
|
1809
|
-
remove_outliers_calc_metrics,
|
1810
|
-
progress_bar,
|
1811
|
-
progress_callback,
|
1698
|
+
sampled_data = self._get_enriched_datasets(
|
1699
|
+
trace_id=trace_id,
|
1700
|
+
validated_X=validated_X,
|
1701
|
+
validated_y=validated_y,
|
1702
|
+
eval_set=validated_eval_set,
|
1703
|
+
exclude_features_sources=exclude_features_sources,
|
1704
|
+
is_input_same_as_fit=is_input_same_as_fit,
|
1705
|
+
is_demo_dataset=is_demo_dataset,
|
1706
|
+
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
1707
|
+
progress_bar=progress_bar,
|
1708
|
+
progress_callback=progress_callback,
|
1709
|
+
is_for_metrics=is_for_metrics,
|
1812
1710
|
)
|
1813
|
-
(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) =
|
1814
|
-
sampled_data
|
1711
|
+
(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features) = (
|
1712
|
+
dataclasses.astuple(sampled_data)
|
1815
1713
|
)
|
1816
1714
|
|
1817
1715
|
excluding_search_keys = list(search_keys.keys())
|
@@ -1827,14 +1725,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
1827
1725
|
|
1828
1726
|
client_features = [
|
1829
1727
|
c
|
1830
|
-
for c in
|
1831
|
-
if (
|
1832
|
-
|
1833
|
-
or c in set(self.feature_names_).union(self.id_columns or [])
|
1834
|
-
or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
|
1835
|
-
)
|
1836
|
-
and c
|
1837
|
-
not in (
|
1728
|
+
for c in (validated_X.columns.to_list() + generated_features)
|
1729
|
+
if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
|
1730
|
+
and c not in (
|
1838
1731
|
excluding_search_keys
|
1839
1732
|
+ list(self.fit_dropped_features)
|
1840
1733
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
@@ -1842,20 +1735,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
1842
1735
|
]
|
1843
1736
|
self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
|
1844
1737
|
|
1845
|
-
|
1846
|
-
importance_threshold, max_features, trace_id, validated_X
|
1847
|
-
)
|
1848
|
-
filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
|
1738
|
+
selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
|
1849
1739
|
|
1850
1740
|
X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
|
1851
1741
|
enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
|
1852
1742
|
|
1853
1743
|
cv, groups = self._get_cv_and_groups(enriched_X_sorted, cv_override, search_keys)
|
1854
1744
|
|
1855
|
-
|
1745
|
+
existing_selected_enriched_features = [c for c in selected_enriched_features if c in enriched_X_sorted.columns]
|
1856
1746
|
|
1857
1747
|
fitting_X = X_sorted[client_features].copy()
|
1858
|
-
fitting_enriched_X = enriched_X_sorted[client_features +
|
1748
|
+
fitting_enriched_X = enriched_X_sorted[client_features + existing_selected_enriched_features].copy()
|
1859
1749
|
|
1860
1750
|
renamed_generate_features = [columns_renaming.get(c, c) for c in (self.generate_features or [])]
|
1861
1751
|
renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]
|
@@ -1995,29 +1885,31 @@ class FeaturesEnricher(TransformerMixin):
|
|
1995
1885
|
X_sampled: pd.DataFrame
|
1996
1886
|
y_sampled: pd.Series
|
1997
1887
|
enriched_X: pd.DataFrame
|
1998
|
-
eval_set_sampled_dict:
|
1999
|
-
search_keys:
|
2000
|
-
columns_renaming:
|
1888
|
+
eval_set_sampled_dict: dict[int, tuple[pd.DataFrame, pd.Series]]
|
1889
|
+
search_keys: dict[str, SearchKey]
|
1890
|
+
columns_renaming: dict[str, str]
|
1891
|
+
generated_features: list[str]
|
2001
1892
|
|
2002
|
-
def
|
1893
|
+
def _get_enriched_datasets(
|
2003
1894
|
self,
|
2004
1895
|
trace_id: str,
|
2005
|
-
validated_X:
|
2006
|
-
validated_y:
|
2007
|
-
eval_set:
|
2008
|
-
exclude_features_sources:
|
1896
|
+
validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
|
1897
|
+
validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
|
1898
|
+
eval_set: list[tuple] | None,
|
1899
|
+
exclude_features_sources: list[str] | None,
|
2009
1900
|
is_input_same_as_fit: bool,
|
2010
1901
|
is_demo_dataset: bool,
|
2011
|
-
remove_outliers_calc_metrics:
|
2012
|
-
progress_bar:
|
2013
|
-
progress_callback:
|
1902
|
+
remove_outliers_calc_metrics: bool | None,
|
1903
|
+
progress_bar: ProgressBar | None,
|
1904
|
+
progress_callback: Callable[[SearchProgress], Any] | None,
|
1905
|
+
is_for_metrics: bool = False,
|
2014
1906
|
) -> _EnrichedDataForMetrics:
|
2015
1907
|
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
2016
1908
|
cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
|
2017
1909
|
if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
|
2018
1910
|
self.logger.info("Cached enriched dataset found - use it")
|
2019
1911
|
return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
|
2020
|
-
elif len(self.
|
1912
|
+
elif len(self.feature_names_) == 0 or all([f in validated_X.columns for f in self.feature_names_]):
|
2021
1913
|
self.logger.info("No external features selected. So use only input datasets for metrics calculation")
|
2022
1914
|
return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
|
2023
1915
|
# TODO save and check if dataset was deduplicated - use imbalance branch for such case
|
@@ -2043,12 +1935,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
2043
1935
|
trace_id,
|
2044
1936
|
progress_bar,
|
2045
1937
|
progress_callback,
|
1938
|
+
is_for_metrics=is_for_metrics,
|
2046
1939
|
)
|
2047
1940
|
|
2048
1941
|
def __get_sampled_cached_enriched(
|
2049
|
-
self, datasets_hash: str, exclude_features_sources:
|
1942
|
+
self, datasets_hash: str, exclude_features_sources: list[str] | None
|
2050
1943
|
) -> _EnrichedDataForMetrics:
|
2051
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
|
1944
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
|
2052
1945
|
self.__cached_sampled_datasets[datasets_hash]
|
2053
1946
|
)
|
2054
1947
|
if exclude_features_sources:
|
@@ -2062,10 +1955,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2062
1955
|
eval_set_sampled_dict,
|
2063
1956
|
columns_renaming,
|
2064
1957
|
search_keys,
|
1958
|
+
generated_features,
|
2065
1959
|
)
|
2066
1960
|
|
2067
1961
|
def __get_enriched_as_input(
|
2068
|
-
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set:
|
1962
|
+
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: list[tuple] | None, is_demo_dataset: bool
|
2069
1963
|
) -> _EnrichedDataForMetrics:
|
2070
1964
|
eval_set_sampled_dict = {}
|
2071
1965
|
|
@@ -2130,6 +2024,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
2130
2024
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
2131
2025
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
2132
2026
|
|
2027
|
+
df = df.rename(columns=columns_renaming)
|
2028
|
+
generated_features = [columns_renaming.get(c, c) for c in generated_features]
|
2029
|
+
search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
|
2030
|
+
|
2133
2031
|
train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
|
2134
2032
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
2135
2033
|
y_sampled = train_df[TARGET].copy()
|
@@ -2152,13 +2050,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
2152
2050
|
eval_set_sampled_dict,
|
2153
2051
|
columns_renaming,
|
2154
2052
|
search_keys,
|
2053
|
+
generated_features,
|
2155
2054
|
)
|
2156
2055
|
|
2157
2056
|
def __get_enriched_from_fit(
|
2158
2057
|
self,
|
2159
|
-
eval_set:
|
2058
|
+
eval_set: list[tuple] | None,
|
2160
2059
|
trace_id: str,
|
2161
|
-
remove_outliers_calc_metrics:
|
2060
|
+
remove_outliers_calc_metrics: bool | None,
|
2162
2061
|
) -> _EnrichedDataForMetrics:
|
2163
2062
|
eval_set_sampled_dict = {}
|
2164
2063
|
search_keys = self.fit_search_keys.copy()
|
@@ -2246,6 +2145,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2246
2145
|
eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
|
2247
2146
|
enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
|
2248
2147
|
search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
|
2148
|
+
generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
|
2249
2149
|
|
2250
2150
|
datasets_hash = hash_input(self.X, self.y, self.eval_set)
|
2251
2151
|
return self.__cache_and_return_results(
|
@@ -2256,17 +2156,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
2256
2156
|
eval_set_sampled_dict,
|
2257
2157
|
self.fit_columns_renaming,
|
2258
2158
|
search_keys,
|
2159
|
+
generated_features,
|
2259
2160
|
)
|
2260
2161
|
|
2261
2162
|
def __get_enriched_from_transform(
|
2262
2163
|
self,
|
2263
2164
|
validated_X: pd.DataFrame,
|
2264
2165
|
validated_y: pd.Series,
|
2265
|
-
eval_set:
|
2266
|
-
exclude_features_sources:
|
2166
|
+
eval_set: list[tuple] | None,
|
2167
|
+
exclude_features_sources: list[str] | None,
|
2267
2168
|
trace_id: str,
|
2268
|
-
progress_bar:
|
2269
|
-
progress_callback:
|
2169
|
+
progress_bar: ProgressBar | None,
|
2170
|
+
progress_callback: Callable[[SearchProgress], Any] | None,
|
2171
|
+
is_for_metrics: bool = False,
|
2270
2172
|
) -> _EnrichedDataForMetrics:
|
2271
2173
|
has_eval_set = eval_set is not None
|
2272
2174
|
|
@@ -2274,6 +2176,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
2274
2176
|
|
2275
2177
|
# Prepare
|
2276
2178
|
df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
|
2179
|
+
|
2180
|
+
# Exclude OOT eval sets from transform because they are not used for metrics calculation
|
2181
|
+
if not is_for_metrics and EVAL_SET_INDEX in df.columns:
|
2182
|
+
for eval_index in df[EVAL_SET_INDEX].unique():
|
2183
|
+
if eval_index == 0:
|
2184
|
+
continue
|
2185
|
+
eval_df = df.query(f"{EVAL_SET_INDEX} == {eval_index}")
|
2186
|
+
if eval_df[TARGET].isna().all():
|
2187
|
+
df = df.query(f"{EVAL_SET_INDEX} != {eval_index}")
|
2188
|
+
|
2277
2189
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
2278
2190
|
df = self.__downsample_for_metrics(df)
|
2279
2191
|
|
@@ -2315,10 +2227,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2315
2227
|
eval_set_sampled_dict,
|
2316
2228
|
columns_renaming,
|
2317
2229
|
search_keys,
|
2230
|
+
generated_features,
|
2318
2231
|
)
|
2319
2232
|
|
2320
2233
|
def __combine_train_and_eval_sets(
|
2321
|
-
self, X: pd.DataFrame, y:
|
2234
|
+
self, X: pd.DataFrame, y: pd.Series | None = None, eval_set: list[tuple] | None = None
|
2322
2235
|
) -> pd.DataFrame:
|
2323
2236
|
df = X.copy()
|
2324
2237
|
if y is not None:
|
@@ -2370,8 +2283,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
2370
2283
|
)
|
2371
2284
|
|
2372
2285
|
def __extract_train_data(
|
2373
|
-
self, enriched_df: pd.DataFrame, x_columns:
|
2374
|
-
) ->
|
2286
|
+
self, enriched_df: pd.DataFrame, x_columns: list[str]
|
2287
|
+
) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
|
2375
2288
|
if EVAL_SET_INDEX in enriched_df.columns:
|
2376
2289
|
enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
|
2377
2290
|
else:
|
@@ -2382,8 +2295,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
2382
2295
|
return X_sampled, y_sampled, enriched_X
|
2383
2296
|
|
2384
2297
|
def __extract_eval_data(
|
2385
|
-
self, enriched_df: pd.DataFrame, x_columns:
|
2386
|
-
) ->
|
2298
|
+
self, enriched_df: pd.DataFrame, x_columns: list[str], enriched_X_columns: list[str], eval_set_len: int
|
2299
|
+
) -> tuple[dict[int, tuple], dict[int, pd.Series]]:
|
2387
2300
|
eval_set_sampled_dict = {}
|
2388
2301
|
|
2389
2302
|
for idx in range(eval_set_len):
|
@@ -2401,9 +2314,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
2401
2314
|
X_sampled: pd.DataFrame,
|
2402
2315
|
y_sampled: pd.Series,
|
2403
2316
|
enriched_X: pd.DataFrame,
|
2404
|
-
eval_set_sampled_dict:
|
2405
|
-
columns_renaming:
|
2406
|
-
search_keys:
|
2317
|
+
eval_set_sampled_dict: dict[int, tuple],
|
2318
|
+
columns_renaming: dict[str, str],
|
2319
|
+
search_keys: dict[str, SearchKey],
|
2320
|
+
generated_features: list[str],
|
2407
2321
|
) -> _EnrichedDataForMetrics:
|
2408
2322
|
|
2409
2323
|
self.__cached_sampled_datasets[datasets_hash] = (
|
@@ -2413,10 +2327,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2413
2327
|
eval_set_sampled_dict,
|
2414
2328
|
search_keys,
|
2415
2329
|
columns_renaming,
|
2330
|
+
generated_features,
|
2416
2331
|
)
|
2417
2332
|
|
2418
2333
|
return self.__mk_sampled_data_tuple(
|
2419
|
-
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
|
2334
|
+
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features
|
2420
2335
|
)
|
2421
2336
|
|
2422
2337
|
def __mk_sampled_data_tuple(
|
@@ -2424,17 +2339,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
2424
2339
|
X_sampled: pd.DataFrame,
|
2425
2340
|
y_sampled: pd.Series,
|
2426
2341
|
enriched_X: pd.DataFrame,
|
2427
|
-
eval_set_sampled_dict:
|
2428
|
-
search_keys:
|
2429
|
-
columns_renaming:
|
2342
|
+
eval_set_sampled_dict: dict,
|
2343
|
+
search_keys: dict,
|
2344
|
+
columns_renaming: dict[str, str],
|
2345
|
+
generated_features: list[str],
|
2430
2346
|
):
|
2431
|
-
# X_sampled - with hash-suffixes
|
2432
|
-
# reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
2433
|
-
# search_keys = {
|
2434
|
-
# reversed_renaming.get(k, k): v
|
2435
|
-
# for k, v in search_keys.items()
|
2436
|
-
# if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
|
2437
|
-
# }
|
2438
2347
|
return FeaturesEnricher._EnrichedDataForMetrics(
|
2439
2348
|
X_sampled=X_sampled,
|
2440
2349
|
y_sampled=y_sampled,
|
@@ -2442,9 +2351,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
2442
2351
|
eval_set_sampled_dict=eval_set_sampled_dict,
|
2443
2352
|
search_keys=search_keys,
|
2444
2353
|
columns_renaming=columns_renaming,
|
2354
|
+
generated_features=generated_features,
|
2445
2355
|
)
|
2446
2356
|
|
2447
|
-
def get_search_id(self) ->
|
2357
|
+
def get_search_id(self) -> str | None:
|
2448
2358
|
"""Returns search_id of the fitted enricher. Not available before a successful fit."""
|
2449
2359
|
return self._search_task.search_task_id if self._search_task else None
|
2450
2360
|
|
@@ -2457,7 +2367,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2457
2367
|
|
2458
2368
|
return self.features_info
|
2459
2369
|
|
2460
|
-
def get_progress(self, trace_id:
|
2370
|
+
def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
|
2461
2371
|
search_task = search_task or self._search_task
|
2462
2372
|
if search_task is not None:
|
2463
2373
|
trace_id = trace_id or uuid.uuid4()
|
@@ -2565,16 +2475,15 @@ if response.status_code == 200:
|
|
2565
2475
|
trace_id: str,
|
2566
2476
|
X: pd.DataFrame,
|
2567
2477
|
*,
|
2568
|
-
y:
|
2569
|
-
exclude_features_sources:
|
2570
|
-
importance_threshold: Optional[float] = None,
|
2571
|
-
max_features: Optional[int] = None,
|
2478
|
+
y: pd.Series | None = None,
|
2479
|
+
exclude_features_sources: list[str] | None = None,
|
2572
2480
|
metrics_calculation: bool = False,
|
2573
2481
|
silent_mode: bool = False,
|
2574
|
-
progress_bar:
|
2575
|
-
progress_callback:
|
2482
|
+
progress_bar: ProgressBar | None = None,
|
2483
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
2576
2484
|
add_fit_system_record_id: bool = False,
|
2577
|
-
|
2485
|
+
keep_input: bool | None = None,
|
2486
|
+
) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
|
2578
2487
|
if self._search_task is None:
|
2579
2488
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
2580
2489
|
|
@@ -2592,11 +2501,8 @@ if response.status_code == 200:
|
|
2592
2501
|
|
2593
2502
|
self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
|
2594
2503
|
|
2595
|
-
filtered_columns = self.__filtered_enriched_features(
|
2596
|
-
importance_threshold, max_features, trace_id, validated_X
|
2597
|
-
)
|
2598
2504
|
# If there are no important features, return original dataframe
|
2599
|
-
if
|
2505
|
+
if len(self.feature_names_) == 0:
|
2600
2506
|
msg = self.bundle.get("no_important_features_for_transform")
|
2601
2507
|
self.__log_warning(msg, show_support_link=True)
|
2602
2508
|
return X, {c: c for c in X.columns}, [], dict()
|
@@ -2703,9 +2609,9 @@ if response.status_code == 200:
|
|
2703
2609
|
if not external_features:
|
2704
2610
|
self.logger.warning(
|
2705
2611
|
"No external features found, returning original dataframe"
|
2706
|
-
f" with generated important features: {
|
2612
|
+
f" with generated important features: {self.feature_names_}"
|
2707
2613
|
)
|
2708
|
-
filtered_columns = [c for c in
|
2614
|
+
filtered_columns = [c for c in self.feature_names_ if c in df.columns]
|
2709
2615
|
self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
|
2710
2616
|
return df[filtered_columns], columns_renaming, generated_features, search_keys
|
2711
2617
|
|
@@ -2843,16 +2749,6 @@ if response.status_code == 200:
|
|
2843
2749
|
)
|
2844
2750
|
dataset.columns_renaming = columns_renaming
|
2845
2751
|
|
2846
|
-
if max_features is not None or importance_threshold is not None:
|
2847
|
-
exclude_features_sources = list(
|
2848
|
-
set(
|
2849
|
-
(exclude_features_sources or [])
|
2850
|
-
+ self._get_excluded_features(max_features, importance_threshold)
|
2851
|
-
)
|
2852
|
-
)
|
2853
|
-
if len(exclude_features_sources) == 0:
|
2854
|
-
exclude_features_sources = None
|
2855
|
-
|
2856
2752
|
validation_task = self._search_task.validation(
|
2857
2753
|
trace_id,
|
2858
2754
|
dataset,
|
@@ -2880,7 +2776,7 @@ if response.status_code == 200:
|
|
2880
2776
|
progress_bar.progress = progress.to_progress_bar()
|
2881
2777
|
if progress_callback is not None:
|
2882
2778
|
progress_callback(progress)
|
2883
|
-
prev_progress:
|
2779
|
+
prev_progress: SearchProgress | None = None
|
2884
2780
|
polling_period_seconds = 1
|
2885
2781
|
try:
|
2886
2782
|
while progress.stage != ProgressStage.DOWNLOADING.value:
|
@@ -2917,6 +2813,8 @@ if response.status_code == 200:
|
|
2917
2813
|
print(self.bundle.get("transform_start"))
|
2918
2814
|
|
2919
2815
|
# Prepare input DataFrame for __enrich by concatenating generated ids and client features
|
2816
|
+
df_before_explode = df_before_explode.rename(columns=columns_renaming)
|
2817
|
+
generated_features = [columns_renaming.get(c, c) for c in generated_features]
|
2920
2818
|
combined_df = pd.concat(
|
2921
2819
|
[
|
2922
2820
|
validated_Xy.reset_index(drop=True),
|
@@ -2934,15 +2832,28 @@ if response.status_code == 200:
|
|
2934
2832
|
)
|
2935
2833
|
|
2936
2834
|
selected_generated_features = [
|
2937
|
-
c for c in generated_features if not self.fit_select_features or c in
|
2938
|
-
]
|
2939
|
-
selecting_columns = [
|
2940
|
-
c
|
2941
|
-
for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
|
2942
|
-
if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
|
2943
|
-
or c in (self.id_columns or [])
|
2835
|
+
c for c in generated_features if not self.fit_select_features or c in self.feature_names_
|
2944
2836
|
]
|
2945
|
-
|
2837
|
+
if keep_input is None:
|
2838
|
+
selected_input_columns = [
|
2839
|
+
c
|
2840
|
+
for c in validated_Xy.columns
|
2841
|
+
if not self.fit_select_features
|
2842
|
+
or c in self.feature_names_
|
2843
|
+
or c in self.search_keys
|
2844
|
+
or c in (self.id_columns or [])
|
2845
|
+
or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
|
2846
|
+
]
|
2847
|
+
elif keep_input is True:
|
2848
|
+
selected_input_columns = validated_Xy.columns.to_list()
|
2849
|
+
else:
|
2850
|
+
selected_input_columns = []
|
2851
|
+
|
2852
|
+
selecting_columns = selected_input_columns + selected_generated_features
|
2853
|
+
selecting_columns.extend(
|
2854
|
+
c for c in result.columns
|
2855
|
+
if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
|
2856
|
+
)
|
2946
2857
|
if add_fit_system_record_id:
|
2947
2858
|
selecting_columns.append(SORT_ID)
|
2948
2859
|
|
@@ -2968,29 +2879,7 @@ if response.status_code == 200:
|
|
2968
2879
|
|
2969
2880
|
return result, columns_renaming, generated_features, search_keys
|
2970
2881
|
|
2971
|
-
def
|
2972
|
-
features_info = self._internal_features_info
|
2973
|
-
comm_schema_header = self.bundle.get("features_info_commercial_schema")
|
2974
|
-
shap_value_header = self.bundle.get("features_info_shap")
|
2975
|
-
feature_name_header = self.bundle.get("features_info_name")
|
2976
|
-
external_features = features_info[features_info[comm_schema_header].str.len() > 0]
|
2977
|
-
filtered_features = external_features
|
2978
|
-
if importance_threshold is not None:
|
2979
|
-
filtered_features = filtered_features[filtered_features[shap_value_header] >= importance_threshold]
|
2980
|
-
if max_features is not None and len(filtered_features) > max_features:
|
2981
|
-
filtered_features = filtered_features.iloc[:max_features, :]
|
2982
|
-
if len(filtered_features) == len(external_features):
|
2983
|
-
return []
|
2984
|
-
else:
|
2985
|
-
if len(filtered_features[filtered_features[comm_schema_header].isin([CommercialSchema.PAID.value])]):
|
2986
|
-
return []
|
2987
|
-
excluded_features = external_features[~external_features.index.isin(filtered_features.index)].copy()
|
2988
|
-
excluded_features = excluded_features[
|
2989
|
-
excluded_features[comm_schema_header].isin([CommercialSchema.PAID.value])
|
2990
|
-
]
|
2991
|
-
return excluded_features[feature_name_header].values.tolist()
|
2992
|
-
|
2993
|
-
def __validate_search_keys(self, search_keys: Dict[str, SearchKey], search_id: Optional[str] = None):
|
2882
|
+
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
2994
2883
|
if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
|
2995
2884
|
if search_id:
|
2996
2885
|
self.logger.debug(f"search_id {search_id} provided without search_keys")
|
@@ -3063,23 +2952,22 @@ if response.status_code == 200:
|
|
3063
2952
|
def __inner_fit(
|
3064
2953
|
self,
|
3065
2954
|
trace_id: str,
|
3066
|
-
X:
|
3067
|
-
y:
|
3068
|
-
eval_set:
|
3069
|
-
progress_bar:
|
2955
|
+
X: pd.DataFrame | pd.Series | np.ndarray,
|
2956
|
+
y: pd.DataFrame | pd.Series | np.ndarray | list | None,
|
2957
|
+
eval_set: list[tuple] | None,
|
2958
|
+
progress_bar: ProgressBar | None,
|
3070
2959
|
start_time: int,
|
3071
2960
|
*,
|
3072
|
-
exclude_features_sources:
|
3073
|
-
calculate_metrics:
|
3074
|
-
scoring:
|
3075
|
-
estimator:
|
3076
|
-
importance_threshold: Optional[float],
|
2961
|
+
exclude_features_sources: list[str] | None = None,
|
2962
|
+
calculate_metrics: bool | None,
|
2963
|
+
scoring: Callable | str | None,
|
2964
|
+
estimator: Any | None,
|
3077
2965
|
stability_threshold: float,
|
3078
|
-
|
3079
|
-
remove_outliers_calc_metrics:
|
2966
|
+
stability_agg_func: str,
|
2967
|
+
remove_outliers_calc_metrics: bool | None,
|
3080
2968
|
auto_fe_parameters: AutoFEParameters,
|
3081
|
-
progress_callback:
|
3082
|
-
search_id_callback:
|
2969
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
2970
|
+
search_id_callback: Callable[[str], Any] | None = None,
|
3083
2971
|
):
|
3084
2972
|
self._search_task = None
|
3085
2973
|
self.warning_counter.reset()
|
@@ -3140,7 +3028,6 @@ if response.status_code == 200:
|
|
3140
3028
|
)
|
3141
3029
|
|
3142
3030
|
df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
|
3143
|
-
self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
|
3144
3031
|
|
3145
3032
|
self.fit_search_keys = self.search_keys.copy()
|
3146
3033
|
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
@@ -3148,8 +3035,22 @@ if response.status_code == 200:
|
|
3148
3035
|
|
3149
3036
|
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
3150
3037
|
has_date = maybe_date_column is not None
|
3038
|
+
|
3151
3039
|
self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
3152
3040
|
|
3041
|
+
if EVAL_SET_INDEX in df.columns:
|
3042
|
+
only_train_df = df.query(f"{EVAL_SET_INDEX} == 0")
|
3043
|
+
only_train_df = only_train_df.drop(columns=[EVAL_SET_INDEX])
|
3044
|
+
else:
|
3045
|
+
only_train_df = df
|
3046
|
+
|
3047
|
+
self.imbalanced = is_imbalanced(only_train_df, self.model_task_type, self.sample_config, self.bundle)
|
3048
|
+
if self.imbalanced:
|
3049
|
+
# Exclude eval sets from fit because they will be transformed before metrics calculation
|
3050
|
+
df = only_train_df
|
3051
|
+
|
3052
|
+
self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
|
3053
|
+
|
3153
3054
|
self._validate_binary_observations(validated_y, self.model_task_type)
|
3154
3055
|
|
3155
3056
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
@@ -3365,6 +3266,7 @@ if response.status_code == 200:
|
|
3365
3266
|
model_task_type=self.model_task_type,
|
3366
3267
|
cv_type=self.cv,
|
3367
3268
|
id_columns=self.__get_renamed_id_columns(),
|
3269
|
+
is_imbalanced=self.imbalanced,
|
3368
3270
|
date_column=self._get_date_column(self.fit_search_keys),
|
3369
3271
|
date_format=self.date_format,
|
3370
3272
|
random_state=self.random_state,
|
@@ -3444,8 +3346,6 @@ if response.status_code == 200:
|
|
3444
3346
|
if progress_callback is not None:
|
3445
3347
|
progress_callback(progress)
|
3446
3348
|
|
3447
|
-
self.imbalanced = dataset.imbalanced
|
3448
|
-
|
3449
3349
|
zero_hit_search_keys = self._search_task.get_zero_hit_rate_search_keys()
|
3450
3350
|
if zero_hit_search_keys:
|
3451
3351
|
self.logger.warning(
|
@@ -3468,33 +3368,24 @@ if response.status_code == 200:
|
|
3468
3368
|
|
3469
3369
|
self.__prepare_feature_importances(trace_id, df)
|
3470
3370
|
|
3471
|
-
self.__show_selected_features(self.fit_search_keys)
|
3472
|
-
|
3473
|
-
autofe_description = self.get_autofe_features_description()
|
3474
|
-
if autofe_description is not None and len(autofe_description) > 0:
|
3475
|
-
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
3476
|
-
self.autofe_features_display_handle = display_html_dataframe(
|
3477
|
-
df=autofe_description,
|
3478
|
-
internal_df=autofe_description,
|
3479
|
-
header=self.bundle.get("autofe_descriptions_header"),
|
3480
|
-
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
3481
|
-
)
|
3482
|
-
|
3483
3371
|
self._select_features_by_psi(
|
3484
3372
|
trace_id=trace_id,
|
3485
3373
|
X=X,
|
3486
3374
|
y=y,
|
3487
3375
|
eval_set=eval_set,
|
3488
3376
|
stability_threshold=stability_threshold,
|
3377
|
+
stability_agg_func=stability_agg_func,
|
3489
3378
|
cv=self.cv,
|
3490
3379
|
estimator=estimator,
|
3491
3380
|
exclude_features_sources=exclude_features_sources,
|
3492
|
-
importance_threshold=importance_threshold,
|
3493
|
-
max_features=max_features,
|
3494
3381
|
progress_bar=progress_bar,
|
3495
3382
|
progress_callback=progress_callback,
|
3496
3383
|
)
|
3497
3384
|
|
3385
|
+
self.__prepare_feature_importances(trace_id, df)
|
3386
|
+
|
3387
|
+
self.__show_selected_features()
|
3388
|
+
|
3498
3389
|
if self._has_paid_features(exclude_features_sources):
|
3499
3390
|
if calculate_metrics is not None and calculate_metrics:
|
3500
3391
|
msg = self.bundle.get("metrics_with_paid_features")
|
@@ -3525,8 +3416,6 @@ if response.status_code == 200:
|
|
3525
3416
|
self.__show_metrics(
|
3526
3417
|
scoring,
|
3527
3418
|
estimator,
|
3528
|
-
importance_threshold,
|
3529
|
-
max_features,
|
3530
3419
|
remove_outliers_calc_metrics,
|
3531
3420
|
trace_id,
|
3532
3421
|
progress_bar,
|
@@ -3543,7 +3432,7 @@ if response.status_code == 200:
|
|
3543
3432
|
if not self.warning_counter.has_warnings():
|
3544
3433
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
3545
3434
|
|
3546
|
-
def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys:
|
3435
|
+
def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: dict[str, str]):
|
3547
3436
|
email_column = self._get_email_column(self.fit_search_keys)
|
3548
3437
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
3549
3438
|
if email_column:
|
@@ -3575,7 +3464,7 @@ if response.status_code == 200:
|
|
3575
3464
|
def __should_add_date_column(self):
|
3576
3465
|
return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
|
3577
3466
|
|
3578
|
-
def __get_renamed_id_columns(self, renaming:
|
3467
|
+
def __get_renamed_id_columns(self, renaming: dict[str, str] | None = None):
|
3579
3468
|
renaming = renaming or self.fit_columns_renaming
|
3580
3469
|
reverse_renaming = {v: k for k, v in renaming.items()}
|
3581
3470
|
return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
|
@@ -3609,7 +3498,7 @@ if response.status_code == 200:
|
|
3609
3498
|
self.cv = cv
|
3610
3499
|
self.runtime_parameters.properties["cv_type"] = self.cv.name
|
3611
3500
|
|
3612
|
-
def get_columns_by_search_keys(self, keys:
|
3501
|
+
def get_columns_by_search_keys(self, keys: list[str]):
|
3613
3502
|
if "HEM" in keys:
|
3614
3503
|
keys.append("EMAIL")
|
3615
3504
|
if "DATE" in keys:
|
@@ -3620,11 +3509,11 @@ if response.status_code == 200:
|
|
3620
3509
|
def _validate_train_eval(
|
3621
3510
|
self,
|
3622
3511
|
X: pd.DataFrame,
|
3623
|
-
y:
|
3624
|
-
eval_set:
|
3512
|
+
y: pd.Series | None = None,
|
3513
|
+
eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
|
3625
3514
|
is_transform: bool = False,
|
3626
3515
|
silent: bool = False,
|
3627
|
-
) ->
|
3516
|
+
) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
|
3628
3517
|
validated_X = self._validate_X(X, is_transform)
|
3629
3518
|
validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
|
3630
3519
|
validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
|
@@ -3633,7 +3522,7 @@ if response.status_code == 200:
|
|
3633
3522
|
def _encode_id_columns(
|
3634
3523
|
self,
|
3635
3524
|
X: pd.DataFrame,
|
3636
|
-
) ->
|
3525
|
+
) -> tuple[pd.DataFrame, dict[str, list[Any]]]:
|
3637
3526
|
unknown_dict = {}
|
3638
3527
|
|
3639
3528
|
if self.id_columns and self.id_columns_encoder is not None:
|
@@ -3703,7 +3592,7 @@ if response.status_code == 200:
|
|
3703
3592
|
|
3704
3593
|
return validated_X
|
3705
3594
|
|
3706
|
-
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) ->
|
3595
|
+
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> pd.Series | None:
|
3707
3596
|
if y is None and not enforce_y:
|
3708
3597
|
return None
|
3709
3598
|
if (
|
@@ -3753,7 +3642,7 @@ if response.status_code == 200:
|
|
3753
3642
|
return validated_y
|
3754
3643
|
|
3755
3644
|
def _validate_eval_set(
|
3756
|
-
self, X: pd.DataFrame, eval_set:
|
3645
|
+
self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
|
3757
3646
|
):
|
3758
3647
|
if eval_set is None:
|
3759
3648
|
return None
|
@@ -3777,7 +3666,7 @@ if response.status_code == 200:
|
|
3777
3666
|
|
3778
3667
|
return validated_eval_set
|
3779
3668
|
|
3780
|
-
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair:
|
3669
|
+
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: tuple) -> tuple[pd.DataFrame, pd.Series]:
|
3781
3670
|
if len(eval_pair) != 2:
|
3782
3671
|
raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
3783
3672
|
eval_X, eval_y = eval_pair
|
@@ -3865,7 +3754,7 @@ if response.status_code == 200:
|
|
3865
3754
|
|
3866
3755
|
return validated_eval_X, validated_eval_y
|
3867
3756
|
|
3868
|
-
def _validate_baseline_score(self, X: pd.DataFrame, eval_set:
|
3757
|
+
def _validate_baseline_score(self, X: pd.DataFrame, eval_set: list[tuple] | None):
|
3869
3758
|
if self.baseline_score_column is not None:
|
3870
3759
|
if self.baseline_score_column not in X.columns:
|
3871
3760
|
raise ValidationError(
|
@@ -3885,15 +3774,15 @@ if response.status_code == 200:
|
|
3885
3774
|
raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
|
3886
3775
|
|
3887
3776
|
@staticmethod
|
3888
|
-
def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) ->
|
3777
|
+
def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
|
3889
3778
|
Xy = pd.concat([X, y], axis=1)
|
3890
3779
|
Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
|
3891
3780
|
return Xy[X.columns].copy(), Xy[TARGET].copy()
|
3892
3781
|
|
3893
3782
|
@staticmethod
|
3894
3783
|
def _sort_by_system_record_id(
|
3895
|
-
X: pd.DataFrame, y: pd.Series, cv:
|
3896
|
-
) ->
|
3784
|
+
X: pd.DataFrame, y: pd.Series, cv: CVType | None
|
3785
|
+
) -> tuple[pd.DataFrame, pd.Series]:
|
3897
3786
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3898
3787
|
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
3899
3788
|
Xy = X.copy()
|
@@ -3910,8 +3799,8 @@ if response.status_code == 200:
|
|
3910
3799
|
# Deprecated
|
3911
3800
|
@staticmethod
|
3912
3801
|
def _sort_by_keys(
|
3913
|
-
X: pd.DataFrame, y: pd.Series, search_keys:
|
3914
|
-
) ->
|
3802
|
+
X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
|
3803
|
+
) -> tuple[pd.DataFrame, pd.Series]:
|
3915
3804
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3916
3805
|
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
3917
3806
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
@@ -3950,16 +3839,14 @@ if response.status_code == 200:
|
|
3950
3839
|
def __log_debug_information(
|
3951
3840
|
self,
|
3952
3841
|
X: pd.DataFrame,
|
3953
|
-
y:
|
3954
|
-
eval_set:
|
3955
|
-
exclude_features_sources:
|
3956
|
-
calculate_metrics:
|
3957
|
-
cv:
|
3958
|
-
|
3959
|
-
|
3960
|
-
|
3961
|
-
estimator: Optional[Any] = None,
|
3962
|
-
remove_outliers_calc_metrics: Optional[bool] = None,
|
3842
|
+
y: pd.Series | np.ndarray | list | None = None,
|
3843
|
+
eval_set: list[tuple] | None = None,
|
3844
|
+
exclude_features_sources: list[str] | None = None,
|
3845
|
+
calculate_metrics: bool | None = None,
|
3846
|
+
cv: Any | None = None,
|
3847
|
+
scoring: Any | None = None,
|
3848
|
+
estimator: Any | None = None,
|
3849
|
+
remove_outliers_calc_metrics: bool | None = None,
|
3963
3850
|
):
|
3964
3851
|
try:
|
3965
3852
|
resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
|
@@ -3972,8 +3859,6 @@ if response.status_code == 200:
|
|
3972
3859
|
f"Runtime parameters: {self.runtime_parameters}\n"
|
3973
3860
|
f"Date format: {self.date_format}\n"
|
3974
3861
|
f"CV: {cv}\n"
|
3975
|
-
f"importance_threshold: {importance_threshold}\n"
|
3976
|
-
f"max_features: {max_features}\n"
|
3977
3862
|
f"Shared datasets: {self.shared_datasets}\n"
|
3978
3863
|
f"Random state: {self.random_state}\n"
|
3979
3864
|
f"Generate features: {self.generate_features}\n"
|
@@ -4037,7 +3922,7 @@ if response.status_code == 200:
|
|
4037
3922
|
except Exception:
|
4038
3923
|
self.logger.warning("Failed to log debug information", exc_info=True)
|
4039
3924
|
|
4040
|
-
def __handle_index_search_keys(self, df: pd.DataFrame, search_keys:
|
3925
|
+
def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
|
4041
3926
|
index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
|
4042
3927
|
index_search_keys = set(index_names).intersection(search_keys.keys())
|
4043
3928
|
if len(index_search_keys) > 0:
|
@@ -4056,7 +3941,7 @@ if response.status_code == 200:
|
|
4056
3941
|
return df
|
4057
3942
|
|
4058
3943
|
def _add_current_date_as_key(
|
4059
|
-
self, df: pd.DataFrame, search_keys:
|
3944
|
+
self, df: pd.DataFrame, search_keys: dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
|
4060
3945
|
) -> pd.DataFrame:
|
4061
3946
|
if (
|
4062
3947
|
set(search_keys.values()) == {SearchKey.PHONE}
|
@@ -4073,7 +3958,7 @@ if response.status_code == 200:
|
|
4073
3958
|
return df
|
4074
3959
|
|
4075
3960
|
@staticmethod
|
4076
|
-
def _get_group_columns(df: pd.DataFrame, search_keys:
|
3961
|
+
def _get_group_columns(df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> list[str]:
|
4077
3962
|
search_key_priority = [SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP]
|
4078
3963
|
for key_type in search_key_priority:
|
4079
3964
|
if key_type in search_keys.values():
|
@@ -4086,7 +3971,7 @@ if response.status_code == 200:
|
|
4086
3971
|
]
|
4087
3972
|
|
4088
3973
|
@staticmethod
|
4089
|
-
def _get_email_column(search_keys:
|
3974
|
+
def _get_email_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4090
3975
|
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
4091
3976
|
if len(cols) > 1:
|
4092
3977
|
raise Exception("More than one email column found after unnest")
|
@@ -4094,7 +3979,7 @@ if response.status_code == 200:
|
|
4094
3979
|
return cols[0]
|
4095
3980
|
|
4096
3981
|
@staticmethod
|
4097
|
-
def _get_hem_column(search_keys:
|
3982
|
+
def _get_hem_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4098
3983
|
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
4099
3984
|
if len(cols) > 1:
|
4100
3985
|
raise Exception("More than one hem column found after unnest")
|
@@ -4102,7 +3987,7 @@ if response.status_code == 200:
|
|
4102
3987
|
return cols[0]
|
4103
3988
|
|
4104
3989
|
@staticmethod
|
4105
|
-
def _get_ip_column(search_keys:
|
3990
|
+
def _get_ip_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4106
3991
|
cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
|
4107
3992
|
if len(cols) > 1:
|
4108
3993
|
raise Exception("More than one ip column found after unnest")
|
@@ -4110,32 +3995,32 @@ if response.status_code == 200:
|
|
4110
3995
|
return cols[0]
|
4111
3996
|
|
4112
3997
|
@staticmethod
|
4113
|
-
def _get_phone_column(search_keys:
|
3998
|
+
def _get_phone_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4114
3999
|
for col, t in search_keys.items():
|
4115
4000
|
if t == SearchKey.PHONE:
|
4116
4001
|
return col
|
4117
4002
|
|
4118
4003
|
@staticmethod
|
4119
|
-
def _get_country_column(search_keys:
|
4004
|
+
def _get_country_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4120
4005
|
for col, t in search_keys.items():
|
4121
4006
|
if t == SearchKey.COUNTRY:
|
4122
4007
|
return col
|
4123
4008
|
|
4124
4009
|
@staticmethod
|
4125
|
-
def _get_postal_column(search_keys:
|
4010
|
+
def _get_postal_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4126
4011
|
for col, t in search_keys.items():
|
4127
4012
|
if t == SearchKey.POSTAL_CODE:
|
4128
4013
|
return col
|
4129
4014
|
|
4130
4015
|
@staticmethod
|
4131
|
-
def _get_date_column(search_keys:
|
4016
|
+
def _get_date_column(search_keys: dict[str, SearchKey]) -> str | None:
|
4132
4017
|
return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
4133
4018
|
|
4134
4019
|
def _explode_multiple_search_keys(
|
4135
|
-
self, df: pd.DataFrame, search_keys:
|
4136
|
-
) ->
|
4020
|
+
self, df: pd.DataFrame, search_keys: dict[str, SearchKey], columns_renaming: dict[str, str]
|
4021
|
+
) -> tuple[pd.DataFrame, dict[str, list[str]]]:
|
4137
4022
|
# find groups of multiple search keys
|
4138
|
-
search_key_names_by_type:
|
4023
|
+
search_key_names_by_type: dict[SearchKey, list[str]] = {}
|
4139
4024
|
for key_name, key_type in search_keys.items():
|
4140
4025
|
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
4141
4026
|
search_key_names_by_type = {
|
@@ -4171,14 +4056,14 @@ if response.status_code == 200:
|
|
4171
4056
|
@staticmethod
|
4172
4057
|
def _add_fit_system_record_id(
|
4173
4058
|
df: pd.DataFrame,
|
4174
|
-
search_keys:
|
4059
|
+
search_keys: dict[str, SearchKey],
|
4175
4060
|
id_name: str,
|
4176
4061
|
target_name: str,
|
4177
|
-
columns_renaming:
|
4178
|
-
id_columns:
|
4179
|
-
cv:
|
4062
|
+
columns_renaming: dict[str, str],
|
4063
|
+
id_columns: list[str] | None,
|
4064
|
+
cv: CVType | None,
|
4180
4065
|
model_task_type: ModelTaskType,
|
4181
|
-
logger:
|
4066
|
+
logger: logging.Logger | None = None,
|
4182
4067
|
bundle: ResourceBundle = bundle,
|
4183
4068
|
) -> pd.DataFrame:
|
4184
4069
|
original_index_name = df.index.name
|
@@ -4296,7 +4181,7 @@ if response.status_code == 200:
|
|
4296
4181
|
|
4297
4182
|
return df
|
4298
4183
|
|
4299
|
-
def __add_country_code(self, df: pd.DataFrame, search_keys:
|
4184
|
+
def __add_country_code(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
|
4300
4185
|
self.country_added = False
|
4301
4186
|
|
4302
4187
|
if self.country_code is not None and SearchKey.COUNTRY not in search_keys.values():
|
@@ -4314,7 +4199,7 @@ if response.status_code == 200:
|
|
4314
4199
|
def __enrich(
|
4315
4200
|
self,
|
4316
4201
|
input_df: pd.DataFrame,
|
4317
|
-
result_features:
|
4202
|
+
result_features: pd.DataFrame | None,
|
4318
4203
|
how: str = "inner",
|
4319
4204
|
drop_system_record_id=True,
|
4320
4205
|
) -> pd.DataFrame:
|
@@ -4422,7 +4307,7 @@ if response.status_code == 200:
|
|
4422
4307
|
|
4423
4308
|
return importances
|
4424
4309
|
|
4425
|
-
def __get_categorical_features(self) ->
|
4310
|
+
def __get_categorical_features(self) -> list[str]:
|
4426
4311
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
4427
4312
|
if features_meta is None:
|
4428
4313
|
raise Exception(self.bundle.get("missing_features_meta"))
|
@@ -4433,11 +4318,13 @@ if response.status_code == 200:
|
|
4433
4318
|
self,
|
4434
4319
|
trace_id: str,
|
4435
4320
|
clients_features_df: pd.DataFrame,
|
4436
|
-
updated_shaps:
|
4321
|
+
updated_shaps: dict[str, float] | None = None,
|
4322
|
+
update_selected_features: bool = True,
|
4437
4323
|
silent=False,
|
4438
4324
|
):
|
4439
4325
|
if self._search_task is None:
|
4440
4326
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
4327
|
+
selected_features = self._search_task.get_selected_features(trace_id)
|
4441
4328
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
4442
4329
|
if features_meta is None:
|
4443
4330
|
raise Exception(self.bundle.get("missing_features_meta"))
|
@@ -4451,8 +4338,6 @@ if response.status_code == 200:
|
|
4451
4338
|
|
4452
4339
|
self.feature_names_ = []
|
4453
4340
|
self.external_source_feature_names = []
|
4454
|
-
self.zero_shap_client_features = []
|
4455
|
-
self.unstable_client_features = []
|
4456
4341
|
self.feature_importances_ = []
|
4457
4342
|
features_info = []
|
4458
4343
|
features_info_without_links = []
|
@@ -4460,12 +4345,19 @@ if response.status_code == 200:
|
|
4460
4345
|
|
4461
4346
|
original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
|
4462
4347
|
|
4348
|
+
selected_features_meta = []
|
4463
4349
|
for feature_meta in features_meta:
|
4464
4350
|
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4465
4351
|
feature_meta.name = original_name
|
4466
4352
|
|
4467
4353
|
is_client_feature = original_name in clients_features_df.columns
|
4468
4354
|
|
4355
|
+
if selected_features is not None and feature_meta.name not in selected_features:
|
4356
|
+
self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
|
4357
|
+
continue
|
4358
|
+
|
4359
|
+
selected_features_meta.append(feature_meta)
|
4360
|
+
|
4469
4361
|
# Show and update shap values for client features only if select_features is True
|
4470
4362
|
if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
|
4471
4363
|
updating_shap = updated_shaps.get(feature_meta.name)
|
@@ -4477,9 +4369,9 @@ if response.status_code == 200:
|
|
4477
4369
|
updating_shap = 0.0
|
4478
4370
|
feature_meta.shap_value = updating_shap
|
4479
4371
|
|
4480
|
-
|
4372
|
+
selected_features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
4481
4373
|
|
4482
|
-
for feature_meta in
|
4374
|
+
for feature_meta in selected_features_meta:
|
4483
4375
|
original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
|
4484
4376
|
is_client_feature = original_name in clients_features_df.columns
|
4485
4377
|
|
@@ -4490,15 +4382,11 @@ if response.status_code == 200:
|
|
4490
4382
|
if original_name in self.psi_values:
|
4491
4383
|
feature_meta.psi_value = self.psi_values[original_name]
|
4492
4384
|
else:
|
4493
|
-
if is_client_feature and self.fit_select_features:
|
4494
|
-
self.unstable_client_features.append(original_name)
|
4495
4385
|
continue
|
4496
4386
|
|
4497
4387
|
# TODO make a decision about selected features based on special flag from mlb
|
4498
4388
|
|
4499
4389
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
4500
|
-
if is_client_feature and self.fit_select_features:
|
4501
|
-
self.zero_shap_client_features.append(original_name)
|
4502
4390
|
continue
|
4503
4391
|
|
4504
4392
|
# Use only important features
|
@@ -4525,6 +4413,9 @@ if response.status_code == 200:
|
|
4525
4413
|
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
4526
4414
|
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
4527
4415
|
|
4416
|
+
if update_selected_features:
|
4417
|
+
self._search_task.update_selected_features(trace_id, self.feature_names_)
|
4418
|
+
|
4528
4419
|
if len(features_info) > 0:
|
4529
4420
|
self.features_info = pd.DataFrame(features_info)
|
4530
4421
|
if self.features_info[self.bundle.get("features_info_psi")].isna().all():
|
@@ -4652,32 +4543,10 @@ if response.status_code == 200:
|
|
4652
4543
|
)
|
4653
4544
|
)
|
4654
4545
|
|
4655
|
-
def __filtered_importance_names(
|
4656
|
-
self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
|
4657
|
-
) -> List[str]:
|
4658
|
-
# get features importance from server
|
4659
|
-
filtered_importances = self.__get_features_importance_from_server(trace_id, df)
|
4660
|
-
|
4661
|
-
if len(filtered_importances) == 0:
|
4662
|
-
return []
|
4663
|
-
|
4664
|
-
if importance_threshold is not None:
|
4665
|
-
filtered_importances = [
|
4666
|
-
(name, importance)
|
4667
|
-
for name, importance in filtered_importances.items()
|
4668
|
-
if importance > importance_threshold
|
4669
|
-
]
|
4670
|
-
if max_features is not None:
|
4671
|
-
filtered_importances = list(filtered_importances)[:max_features]
|
4672
|
-
if len(filtered_importances) == 0:
|
4673
|
-
return []
|
4674
|
-
filtered_importance_names, _ = zip(*filtered_importances)
|
4675
|
-
return list(filtered_importance_names)
|
4676
|
-
|
4677
4546
|
def __prepare_search_keys(
|
4678
4547
|
self,
|
4679
4548
|
x: pd.DataFrame,
|
4680
|
-
search_keys:
|
4549
|
+
search_keys: dict[str, SearchKey],
|
4681
4550
|
is_demo_dataset: bool,
|
4682
4551
|
is_transform=False,
|
4683
4552
|
silent_mode=False,
|
@@ -4788,20 +4657,16 @@ if response.status_code == 200:
|
|
4788
4657
|
|
4789
4658
|
def __show_metrics(
|
4790
4659
|
self,
|
4791
|
-
scoring:
|
4792
|
-
estimator:
|
4793
|
-
|
4794
|
-
max_features: Optional[int],
|
4795
|
-
remove_outliers_calc_metrics: Optional[bool],
|
4660
|
+
scoring: Callable | str | None,
|
4661
|
+
estimator: Any | None,
|
4662
|
+
remove_outliers_calc_metrics: bool | None,
|
4796
4663
|
trace_id: str,
|
4797
|
-
progress_bar:
|
4798
|
-
progress_callback:
|
4664
|
+
progress_bar: ProgressBar | None = None,
|
4665
|
+
progress_callback: Callable[[SearchProgress], Any] | None = None,
|
4799
4666
|
):
|
4800
4667
|
self.metrics = self.calculate_metrics(
|
4801
4668
|
scoring=scoring,
|
4802
4669
|
estimator=estimator,
|
4803
|
-
importance_threshold=importance_threshold,
|
4804
|
-
max_features=max_features,
|
4805
4670
|
remove_outliers_calc_metrics=remove_outliers_calc_metrics,
|
4806
4671
|
trace_id=trace_id,
|
4807
4672
|
internal_call=True,
|
@@ -4812,22 +4677,15 @@ if response.status_code == 200:
|
|
4812
4677
|
msg = self.bundle.get("quality_metrics_header")
|
4813
4678
|
display_html_dataframe(self.metrics, self.metrics, msg)
|
4814
4679
|
|
4815
|
-
def __show_selected_features(self
|
4816
|
-
search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
|
4817
|
-
if self.fit_columns_renaming:
|
4818
|
-
search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
|
4819
|
-
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
4820
|
-
|
4680
|
+
def __show_selected_features(self):
|
4821
4681
|
try:
|
4822
4682
|
_ = get_ipython() # type: ignore
|
4823
4683
|
|
4824
|
-
print(Format.GREEN + Format.BOLD + msg + Format.END)
|
4825
|
-
self.logger.info(msg)
|
4826
4684
|
if len(self.feature_names_) > 0:
|
4827
4685
|
self.features_info_display_handle = display_html_dataframe(
|
4828
4686
|
self.features_info,
|
4829
4687
|
self._features_info_without_links,
|
4830
|
-
self.bundle.get("relevant_features_header"),
|
4688
|
+
self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
|
4831
4689
|
display_id=f"features_info_{uuid.uuid4()}",
|
4832
4690
|
)
|
4833
4691
|
|
@@ -4838,14 +4696,23 @@ if response.status_code == 200:
|
|
4838
4696
|
self.bundle.get("relevant_data_sources_header"),
|
4839
4697
|
display_id=f"data_sources_{uuid.uuid4()}",
|
4840
4698
|
)
|
4699
|
+
|
4700
|
+
autofe_description = self.get_autofe_features_description()
|
4701
|
+
if autofe_description is not None and len(autofe_description) > 0:
|
4702
|
+
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
4703
|
+
self.autofe_features_display_handle = display_html_dataframe(
|
4704
|
+
df=autofe_description,
|
4705
|
+
internal_df=autofe_description,
|
4706
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
4707
|
+
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
4708
|
+
)
|
4841
4709
|
else:
|
4842
4710
|
msg = self.bundle.get("features_info_zero_important_features")
|
4843
4711
|
self.__log_warning(msg, show_support_link=True)
|
4844
4712
|
except (ImportError, NameError):
|
4845
|
-
print(msg)
|
4846
4713
|
print(self._internal_features_info)
|
4847
4714
|
|
4848
|
-
def __show_report_button(self, display_id:
|
4715
|
+
def __show_report_button(self, display_id: str | None = None, display_handle=None):
|
4849
4716
|
try:
|
4850
4717
|
return prepare_and_show_report(
|
4851
4718
|
relevant_features_df=self._features_info_without_links,
|
@@ -4861,40 +4728,14 @@ if response.status_code == 200:
|
|
4861
4728
|
except Exception:
|
4862
4729
|
pass
|
4863
4730
|
|
4864
|
-
def __validate_importance_threshold(self, importance_threshold: Optional[float]) -> float:
|
4865
|
-
try:
|
4866
|
-
return float(importance_threshold) if importance_threshold is not None else 0.0
|
4867
|
-
except ValueError:
|
4868
|
-
self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
|
4869
|
-
raise ValidationError(self.bundle.get("invalid_importance_threshold"))
|
4870
|
-
|
4871
|
-
def __validate_max_features(self, max_features: Optional[int]) -> int:
|
4872
|
-
try:
|
4873
|
-
return int(max_features) if max_features is not None else 400
|
4874
|
-
except ValueError:
|
4875
|
-
self.logger.exception(f"Invalid max_features provided: {max_features}")
|
4876
|
-
raise ValidationError(self.bundle.get("invalid_max_features"))
|
4877
|
-
|
4878
|
-
def __filtered_enriched_features(
|
4879
|
-
self,
|
4880
|
-
importance_threshold: Optional[float],
|
4881
|
-
max_features: Optional[int],
|
4882
|
-
trace_id: str,
|
4883
|
-
df: pd.DataFrame,
|
4884
|
-
) -> List[str]:
|
4885
|
-
importance_threshold = self.__validate_importance_threshold(importance_threshold)
|
4886
|
-
max_features = self.__validate_max_features(max_features)
|
4887
|
-
|
4888
|
-
return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
|
4889
|
-
|
4890
4731
|
def __detect_missing_search_keys(
|
4891
4732
|
self,
|
4892
4733
|
df: pd.DataFrame,
|
4893
|
-
search_keys:
|
4734
|
+
search_keys: dict[str, SearchKey],
|
4894
4735
|
is_demo_dataset: bool,
|
4895
4736
|
silent_mode=False,
|
4896
4737
|
is_transform=False,
|
4897
|
-
) ->
|
4738
|
+
) -> dict[str, SearchKey]:
|
4898
4739
|
sample = df.head(100)
|
4899
4740
|
|
4900
4741
|
def check_need_detect(search_key: SearchKey):
|
@@ -5011,7 +4852,7 @@ if response.status_code == 200:
|
|
5011
4852
|
except Exception:
|
5012
4853
|
self.logger.exception("Failed to dump python libs")
|
5013
4854
|
|
5014
|
-
def __display_support_link(self, link_text:
|
4855
|
+
def __display_support_link(self, link_text: str | None = None):
|
5015
4856
|
support_link = self.bundle.get("support_link")
|
5016
4857
|
link_text = link_text or self.bundle.get("support_text")
|
5017
4858
|
try:
|
@@ -5038,9 +4879,9 @@ if response.status_code == 200:
|
|
5038
4879
|
def dump_input(
|
5039
4880
|
self,
|
5040
4881
|
trace_id: str,
|
5041
|
-
X:
|
5042
|
-
y:
|
5043
|
-
eval_set:
|
4882
|
+
X: pd.DataFrame | pd.Series,
|
4883
|
+
y: pd.DataFrame | pd.Series | None = None,
|
4884
|
+
eval_set: tuple | None = None,
|
5044
4885
|
):
|
5045
4886
|
def dump_task(X_, y_, eval_set_):
|
5046
4887
|
with MDC(trace_id=trace_id):
|
@@ -5131,28 +4972,10 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
|
|
5131
4972
|
raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
|
5132
4973
|
|
5133
4974
|
|
5134
|
-
def drop_duplicates(df:
|
4975
|
+
def drop_duplicates(df: pd.DataFrame | np.ndarray | Any) -> pd.DataFrame:
|
5135
4976
|
if isinstance(df, pd.DataFrame):
|
5136
4977
|
return df.drop_duplicates()
|
5137
4978
|
elif isinstance(df, np.ndarray):
|
5138
4979
|
return pd.DataFrame(df).drop_duplicates()
|
5139
4980
|
else:
|
5140
4981
|
return df
|
5141
|
-
|
5142
|
-
|
5143
|
-
def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[Tuple]] = None) -> str:
|
5144
|
-
hashed_objects = []
|
5145
|
-
try:
|
5146
|
-
hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
|
5147
|
-
if y is not None:
|
5148
|
-
hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
|
5149
|
-
if eval_set is not None:
|
5150
|
-
if isinstance(eval_set, tuple):
|
5151
|
-
eval_set = [eval_set]
|
5152
|
-
for eval_X, eval_y in eval_set:
|
5153
|
-
hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
|
5154
|
-
hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
|
5155
|
-
common_hash = hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
|
5156
|
-
return common_hash
|
5157
|
-
except Exception:
|
5158
|
-
return ""
|