upgini-1.2.86.dev1-py3-none-any.whl → upgini-1.2.87-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/data_source/data_source_publisher.py +21 -0
- upgini/features_enricher.py +91 -41
- upgini/metrics.py +103 -41
- upgini/resource_bundle/strings.properties +3 -1
- upgini/utils/datetime_utils.py +130 -118
- upgini/utils/deduplicate_utils.py +4 -4
- upgini/utils/email_utils.py +5 -5
- upgini/utils/sklearn_ext.py +112 -8
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87.dist-info}/METADATA +1 -1
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87.dist-info}/RECORD +13 -13
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87.dist-info}/WHEEL +0 -0
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.86.dev1"
+__version__ = "1.2.87"
upgini/data_source/data_source_publisher.py
CHANGED
@@ -5,6 +5,8 @@ from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Literal, Optional, Union
 
+import pandas as pd
+
 from upgini.errors import HttpError, ValidationError
 from upgini.http import LoggerFactory, get_rest_client
 from upgini.mdc import MDC
@@ -137,6 +139,25 @@ class DataSourcePublisher:
         ) and not date_format:
             raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
 
+        if secondary_search_keys:
+            response = self._rest_client.get_active_ads_definitions()
+            definitions = pd.DataFrame(response["adsDefinitions"])
+            prod_secondary_definitions = definitions.query(
+                "(secondarySearchKeys.astype('string') != '[]') & (adsDefinitionAccessType == 'PROD')"
+            )[["name", "searchKeys", "secondarySearchKeys"]]
+            for _, row in prod_secondary_definitions.iterrows():
+                existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
+                if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
+                    existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
+                    if (
+                        existing_search_keys == {v.value.name for v in search_keys.values()}
+                        or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
+                    ):
+                        raise ValidationError(
+                            "ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
+                            f"already exists: {row['name']}"
+                        )
+
         request = {
             "dataTableUri": data_table_uri,
             "searchKeys": {k: v.value.value for k, v in search_keys.items()},
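The guard added here flattens each existing definition's nested key lists into sets and compares them with the keys of the definition being registered. Below is a minimal sketch of that comparison with made-up definitions; the response payload shape and key names are assumptions (the library code expresses the filter via DataFrame.query with astype('string'), shown verbatim in the hunk above).

import pandas as pd

# Toy stand-in for the get_active_ads_definitions() response (shape assumed)
definitions = pd.DataFrame(
    {
        "name": ["ads_ip_to_email", "ads_phone"],
        "searchKeys": [[["IP"]], [["PHONE"]]],
        "secondarySearchKeys": [[["EMAIL"], ["HEM"]], []],
        "adsDefinitionAccessType": ["PROD", "PROD"],
    }
)

# Same filter as the query above: PROD definitions that declare secondary keys
mask = (definitions["secondarySearchKeys"].astype(str) != "[]") & (
    definitions["adsDefinitionAccessType"] == "PROD"
)
prod_secondary_definitions = definitions[mask][["name", "searchKeys", "secondarySearchKeys"]]

new_primary = {"IP"}              # stands in for {v.value.name for v in search_keys.values()}
new_secondary = {"EMAIL", "HEM"}  # stands in for the secondary_search_keys set

for _, row in prod_secondary_definitions.iterrows():
    existing_secondary = {key for keys in row["secondarySearchKeys"] for key in keys}
    if existing_secondary == new_secondary:
        existing_primary = {key for keys in row["searchKeys"] for key in keys}
        if existing_primary == new_primary:
            print(f"duplicate mapping, ValidationError would cite: {row['name']}")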
upgini/features_enricher.py
CHANGED
@@ -30,7 +30,7 @@ from pandas.api.types import (
 from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
-from sklearn.model_selection import BaseCrossValidator
+from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -71,6 +71,7 @@ from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.search_task import SearchTask
 from upgini.spinner import Spinner
 from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.country_utils import (
     CountrySearchKeyConverter,
     CountrySearchKeyDetector,
@@ -114,7 +115,9 @@ from upgini.utils.postal_code_utils import (
 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
 except Exception:
-    from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
+    from upgini.utils.fallback_progress_bar import (
+        CustomFallbackProgressBar as ProgressBar,
+    )
 
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
@@ -239,6 +242,7 @@ class FeaturesEnricher(TransformerMixin):
         add_date_if_missing: bool = True,
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
+        generate_search_key_features: bool = True,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
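A hypothetical fit-time usage of the new constructor flag; the column names and search keys below are invented for illustration.

from upgini import FeaturesEnricher, SearchKey

enricher = FeaturesEnricher(
    search_keys={"reg_date": SearchKey.DATE, "email": SearchKey.EMAIL},
    # New in 1.2.87: set to False to skip client-side features generated from
    # search keys (cyclical date features, email domain features) while still
    # enriching from external sources.
    generate_search_key_features=False,
)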
@@ -296,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
@@ -311,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
             self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
             file_metadata = self._search_task.get_file_metadata(trace_id)
-            x_columns = [c.originalName or c.name for c in file_metadata.columns]
+            x_columns = [c.name for c in file_metadata.columns]
             self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
             df = pd.DataFrame(columns=x_columns)
             self.__prepare_feature_importances(trace_id, df, silent=True)
@@ -365,6 +369,8 @@ class FeaturesEnricher(TransformerMixin):
         self.exclude_columns = exclude_columns
         self.baseline_score_column = baseline_score_column
         self.add_date_if_missing = add_date_if_missing
+        self.generate_search_key_features = generate_search_key_features
+
         self.features_info_display_handle = None
         self.data_sources_display_handle = None
         self.autofe_features_display_handle = None
@@ -1045,6 +1051,7 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
         has_date = self._get_date_column(search_keys) is not None
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
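One subtlety in the added `has_time` line: Python's `and` binds tighter than `or`, so the expression parses as `(has_date and isinstance(_cv, TimeSeriesSplit)) or isinstance(_cv, BlockedTimeSeriesSplit)`; a `BlockedTimeSeriesSplit` therefore sets `has_time` even when no date column is present. A quick self-contained check (stub classes stand in for the real splitters):

class TimeSeriesSplit: ...          # stub for sklearn.model_selection.TimeSeriesSplit
class BlockedTimeSeriesSplit: ...   # stub for upgini.utils.blocked_time_series.BlockedTimeSeriesSplit

has_date = False
_cv = BlockedTimeSeriesSplit()
has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
assert has_time is True  # the `or` branch wins regardless of has_date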
@@ -1077,7 +1084,7 @@ class FeaturesEnricher(TransformerMixin):
                 add_params=custom_loss_add_params,
                 groups=groups,
                 text_features=text_features,
-
+                has_time=has_time,
             )
             baseline_cv_result = baseline_estimator.cross_val_predict(
                 fitting_X, y_sorted, baseline_score_column
@@ -1112,7 +1119,7 @@ class FeaturesEnricher(TransformerMixin):
                 add_params=custom_loss_add_params,
                 groups=groups,
                 text_features=text_features,
-
+                has_time=has_time,
             )
             enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
             enriched_metric = enriched_cv_result.get_display_metric()
@@ -1773,7 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
         date_column = self._get_date_column(search_keys)
         generated_features = []
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
+            converter = DateTimeSearchKeyConverter(
+                date_column,
+                self.date_format,
+                self.logger,
+                self.bundle,
+                generate_cyclical_features=self.generate_search_key_features,
+            )
             # Leave original date column values
             df_with_date_features = converter.convert(df, keep_time=True)
             df_with_date_features[date_column] = df[date_column]
@@ -1781,7 +1794,7 @@ class FeaturesEnricher(TransformerMixin):
             generated_features = converter.generated_features
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
-        if email_columns:
+        if email_columns and self.generate_search_key_features:
             generator = EmailDomainGenerator(email_columns)
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)
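The new flag feeds `DateTimeSearchKeyConverter(..., generate_cyclical_features=...)`, which by its name produces cyclical encodings of date parts. A generic sketch of that technique, assuming a sin/cos encoding (the actual feature names and periods live in upgini/utils/datetime_utils.py, also touched in this release, and may differ):

import numpy as np
import pandas as pd

dates = pd.to_datetime(pd.Series(["2024-01-15", "2024-06-15", "2024-12-31"]))
day_of_year = dates.dt.dayofyear
# Project the periodic value onto the unit circle so Dec 31 and Jan 1 end up close
date_sin = np.sin(2 * np.pi * day_of_year / 365.25)
date_cos = np.cos(2 * np.pi * day_of_year / 365.25)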
@@ -2204,10 +2217,12 @@ class FeaturesEnricher(TransformerMixin):
                     {"name": name, "value": key_example(sk_type)} for name in sk_meta.unnestKeyNames
                 ]
             else:
-                search_keys_with_values[sk_type.name] = [
-                    {"name": sk_meta.originalName,
-                     "value": key_example(sk_type)}
-                ]
+                search_keys_with_values[sk_type.name] = [
+                    {
+                        "name": sk_meta.originalName,
+                        "value": key_example(sk_type),
+                    }
+                ]
 
         keys_section = json.dumps(search_keys_with_values)
         features_for_transform = self._search_task.get_features_for_transform()
@@ -2284,11 +2299,16 @@ if response.status_code == 200:
 
         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
-        self.__validate_search_keys(self.search_keys, self.search_id)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
+        # If there are no important features, return original dataframe
+        if not filtered_columns:
+            msg = self.bundle.get("no_important_features_for_transform")
+            self.__log_warning(msg, show_support_link=True)
+            return X, {c: c for c in X.columns}, [], dict()
 
-        if len(self.feature_names_) == 0:
-            self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-            return X, {c: c for c in X.columns}, [], {}
+        self.__validate_search_keys(self.search_keys, self.search_id)
 
         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
@@ -2327,9 +2347,7 @@ if response.status_code == 200:
 
         is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-        columns_to_drop = [
-            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
-        ]
+        columns_to_drop = [c for c in df.columns if c in self.feature_names_]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
@@ -2360,7 +2378,13 @@ if response.status_code == 200:
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
+            converter = DateTimeSearchKeyConverter(
+                date_column,
+                self.date_format,
+                self.logger,
+                bundle=self.bundle,
+                generate_cyclical_features=self.generate_search_key_features,
+            )
             df = converter.convert(df, keep_time=True)
             self.logger.info(f"Date column after convertion: {df[date_column]}")
             generated_features.extend(converter.generated_features)
@@ -2370,7 +2394,7 @@ if response.status_code == 200:
         df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
-        if email_columns:
+        if email_columns and self.generate_search_key_features:
             generator = EmailDomainGenerator(email_columns)
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)
@@ -2379,6 +2403,17 @@ if response.status_code == 200:
             df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
             columns_renaming = normalizer.columns_renaming
 
+        # If there are no external features, we don't call backend on transform
+        external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
+        if not external_features:
+            self.logger.warning(
+                "No external features found, returning original dataframe"
+                f" with generated important features: {filtered_columns}"
+            )
+            filtered_columns = [c for c in filtered_columns if c in df.columns]
+            self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
+            return df[filtered_columns], columns_renaming, generated_features, search_keys
+
         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
         features_for_transform = self._search_task.get_features_for_transform() or []
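Together with the no-important-features return added in the @@ -2284,11 hunk above, transform in 1.2.87 short-circuits twice before any backend call: once when nothing passed feature selection, and once when everything that passed was generated client-side. A simplified pseudo-flow; `enrich` is a hypothetical stand-in for the backend round-trip, other names follow the diff:

def transform_flow(X, filtered_columns, external_features, enrich):
    if not filtered_columns:
        # No important features at all: warn and hand X back unchanged.
        return X
    if not external_features:
        # Only client-side generated features are important: skip the backend
        # and return just the locally generated columns.
        return X[[c for c in filtered_columns if c in X.columns]]
    return enrich(X)  # full path: send search keys to the backend for enrichment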
@@ -2423,6 +2458,8 @@ if response.status_code == 200:
         # Explode multiple search keys
         df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
 
+        # Convert search keys and generate features on them
+
         email_column = self._get_email_column(search_keys)
         hem_column = self._get_hem_column(search_keys)
         if email_column:
@@ -2611,17 +2648,15 @@ if response.status_code == 200:
                 how="left",
             )
 
+            selected_generated_features = [
+                c for c in generated_features if not self.fit_select_features or c in filtered_columns
+            ]
             selecting_columns = [
                 c
-                for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
-                if c not in self.dropped_client_feature_names_
+                for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
+                if c not in self.zero_shap_client_features
             ]
-            filtered_columns = self.__filtered_enriched_features(
-                importance_threshold, max_features, trace_id, validated_X
-            )
-            selecting_columns.extend(
-                c for c in filtered_columns if c in result.columns and c not in validated_X.columns
-            )
+            selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
                 selecting_columns.append(SORT_ID)
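The rewritten selection keeps a generated feature only when feature selection retained it (or selection is off), filters out zero-SHAP client columns, then appends any remaining selected columns from the enriched result. A worked toy example of the same list logic; all names are invented:

import itertools

validated_Xy_columns = ["age", "city", "target"]
generated_features = ["reg_date_sin", "reg_date_cos"]
filtered_columns = ["reg_date_sin", "EXTERNAL_income_score"]
zero_shap_client_features = ["city"]
result_columns = validated_Xy_columns + generated_features + ["EXTERNAL_income_score"]
fit_select_features = True

selected_generated_features = [
    c for c in generated_features if not fit_select_features or c in filtered_columns
]
selecting_columns = [
    c
    for c in itertools.chain(validated_Xy_columns, selected_generated_features)
    if c not in zero_shap_client_features
]
selecting_columns.extend(
    c for c in result_columns if c in filtered_columns and c not in selecting_columns
)
assert selecting_columns == ["age", "target", "reg_date_sin", "EXTERNAL_income_score"]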
@@ -2860,6 +2895,7 @@ if response.status_code == 200:
                 self.date_format,
                 self.logger,
                 bundle=self.bundle,
+                generate_cyclical_features=self.generate_search_key_features,
             )
             df = converter.convert(df, keep_time=True)
             if converter.has_old_dates:
@@ -2872,7 +2908,7 @@ if response.status_code == 200:
         df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
-        if email_columns:
+        if email_columns and self.generate_search_key_features:
             generator = EmailDomainGenerator(email_columns)
             df = generator.generate(df)
             self.fit_generated_features.extend(generator.generated_features)
@@ -2920,7 +2956,10 @@ if response.status_code == 200:
             self.__log_warning(fintech_warning)
         df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
         if full_duplicates_warning:
-            self.__log_warning(full_duplicates_warning)
+            if len(df) == 0:
+                raise ValidationError(full_duplicates_warning)
+            else:
+                self.__log_warning(full_duplicates_warning)
 
         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
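Behavioral change: when deduplication leaves an empty training frame, fit now fails fast with a ValidationError instead of only logging a warning. A toy illustration of the end state only; clean_full_duplicates is internal to upgini and its exact rules differ:

import pandas as pd

df = pd.DataFrame({"email": ["a@x.com"] * 3, "target": [0, 1, 1]})
# All three rows share the same feature values, so keep=False drops the whole
# duplicate group, leaving an empty frame
deduped = df.drop_duplicates(subset=["email"], keep=False)
if len(deduped) == 0:
    raise ValueError("X became empty after dropping full duplicates")  # illustrative message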
@@ -3323,9 +3362,13 @@ if response.status_code == 200:
             Xy[TARGET] = y
             validated_y = Xy[TARGET].copy()
 
-            if validated_y.nunique() < 2:
+            y_nunique = validated_y.nunique()
+            if y_nunique < 2:
                 raise ValidationError(self.bundle.get("y_is_constant"))
 
+            if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
+                raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
+
         return validated_y
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3400,9 +3443,13 @@ if response.status_code == 200:
         else:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
-        if validated_eval_y.nunique() < 2:
+        eval_y_nunique = validated_eval_y.nunique()
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
+
         return validated_eval_X, validated_eval_y
 
     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
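Both hunks enforce the same rule: for an explicitly binary task, the train and eval targets must contain exactly two distinct values. A minimal standalone version of the check (error texts assumed; the real ones are keyed in resource_bundle/strings.properties):

import pandas as pd

def check_binary_target(y: pd.Series) -> None:
    n = y.nunique()
    if n < 2:
        raise ValueError("Target is constant")
    if n != 2:
        raise ValueError(f"Binary task expects exactly 2 unique target values, got {n}")

check_binary_target(pd.Series([0, 1, 0]))    # passes
# check_binary_target(pd.Series([0, 1, 2])) # would raise: got 3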
@@ -3564,7 +3611,9 @@ if response.status_code == 200:
         maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
             # TODO cast date column to single dtype
-            date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
+            date_converter = DateTimeSearchKeyConverter(
+                maybe_date_col, self.date_format, generate_cyclical_features=False
+            )
             converted_X = date_converter.convert(X)
             min_date = converted_X[maybe_date_col].min()
             max_date = converted_X[maybe_date_col].max()
@@ -3603,7 +3652,7 @@ if response.status_code == 200:
             self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
-            converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
+            converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
             df = converter.convert(df)
             return df
@@ -3942,10 +3991,11 @@ if response.status_code == 200:
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
+        # To be sure that names with hash suffixes
         df = df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -3957,7 +4007,7 @@ if response.status_code == 200:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
-            is_client_feature = feature_meta.name in df.columns
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -3973,13 +4023,13 @@ if response.status_code == 200:
         features_meta.sort(key=lambda m: (-m.shap_value, m.name))
 
         for feature_meta in features_meta:
-
-            is_client_feature = feature_meta.name in df.columns
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            is_client_feature = original_name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
-                if self.fit_select_features:
-                    self.dropped_client_feature_names_.append(feature_meta.name)
+                if is_client_feature and self.fit_select_features:
+                    self.zero_shap_client_features.append(original_name)
                 continue
 
             # Use only important features
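The rename from `dropped_client_feature_names_` to `zero_shap_client_features` also narrows what is stored: only client columns, tracked under their original (pre-hash) names, so they can be excluded from the enriched output later. A toy version of that bookkeeping; the metadata below is invented:

original_names_dict = {"f1_ab12": "age", "f2_cd34": "city"}
shap_values = {"f1_ab12": 0.0, "f2_cd34": 0.31, "EXTERNAL_income_score": 0.18}
client_columns = {"age", "city"}

zero_shap_client_features = []
for name, shap in shap_values.items():
    original_name = original_names_dict.get(name, name)
    # Only zero-importance *client* features are remembered for exclusion
    if shap == 0.0 and original_name in client_columns:
        zero_shap_client_features.append(original_name)

assert zero_shap_client_features == ["age"]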
|