upgini 1.2.29a3__tar.gz → 1.2.29a5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.29a3 → upgini-1.2.29a5}/PKG-INFO +1 -1
- upgini-1.2.29a5/src/upgini/__about__.py +1 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/features_enricher.py +38 -117
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/metrics.py +33 -9
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/datetime_utils.py +2 -1
- upgini-1.2.29a5/src/upgini/utils/feature_info.py +172 -0
- upgini-1.2.29a3/src/upgini/__about__.py +0 -1
- {upgini-1.2.29a3 → upgini-1.2.29a5}/.gitignore +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/LICENSE +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/README.md +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/pyproject.toml +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/ads.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/dataset.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/errors.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/http.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/metadata.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/search_task.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/spinner.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.29a3 → upgini-1.2.29a5}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.29a5"
|
|
@@ -2,7 +2,6 @@ import dataclasses
|
|
|
2
2
|
import datetime
|
|
3
3
|
import gc
|
|
4
4
|
import hashlib
|
|
5
|
-
import itertools
|
|
6
5
|
import logging
|
|
7
6
|
import numbers
|
|
8
7
|
import os
|
|
@@ -54,6 +53,7 @@ from upgini.metadata import (
|
|
|
54
53
|
SYSTEM_RECORD_ID,
|
|
55
54
|
TARGET,
|
|
56
55
|
CVType,
|
|
56
|
+
FeaturesMetadataV2,
|
|
57
57
|
FileColumnMeaningType,
|
|
58
58
|
ModelTaskType,
|
|
59
59
|
RuntimeParameters,
|
|
@@ -95,6 +95,7 @@ from upgini.utils.email_utils import (
|
|
|
95
95
|
EmailSearchKeyConverter,
|
|
96
96
|
EmailSearchKeyDetector,
|
|
97
97
|
)
|
|
98
|
+
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
98
99
|
from upgini.utils.features_validator import FeaturesValidator
|
|
99
100
|
from upgini.utils.format import Format
|
|
100
101
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -224,6 +225,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
224
225
|
client_visitorid: Optional[str] = None,
|
|
225
226
|
custom_bundle_config: Optional[str] = None,
|
|
226
227
|
add_date_if_missing: bool = True,
|
|
228
|
+
select_features: bool = False,
|
|
227
229
|
**kwargs,
|
|
228
230
|
):
|
|
229
231
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,8 +279,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
279
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
278
280
|
self.metrics: Optional[pd.DataFrame] = None
|
|
279
281
|
self.feature_names_ = []
|
|
282
|
+
self.client_feature_names_ = []
|
|
280
283
|
self.feature_importances_ = []
|
|
281
284
|
self.search_id = search_id
|
|
285
|
+
self.select_features = select_features
|
|
286
|
+
|
|
282
287
|
if search_id:
|
|
283
288
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
284
289
|
|
|
@@ -999,9 +1004,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
999
1004
|
text_features=self.generate_features,
|
|
1000
1005
|
has_date=has_date,
|
|
1001
1006
|
)
|
|
1002
|
-
|
|
1007
|
+
etalon_cv_result = baseline_estimator.cross_val_predict(
|
|
1003
1008
|
fitting_X, y_sorted, self.baseline_score_column
|
|
1004
1009
|
)
|
|
1010
|
+
etalon_metric = etalon_cv_result.get_display_metric()
|
|
1005
1011
|
if etalon_metric is None:
|
|
1006
1012
|
self.logger.info(
|
|
1007
1013
|
f"Baseline {metric} on train client features is None (maybe all features was removed)"
|
|
@@ -1033,9 +1039,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1033
1039
|
text_features=self.generate_features,
|
|
1034
1040
|
has_date=has_date,
|
|
1035
1041
|
)
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1042
|
+
enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
1043
|
+
enriched_metric = enriched_cv_result.get_display_metric()
|
|
1044
|
+
enriched_shaps = enriched_cv_result.shap_values
|
|
1039
1045
|
|
|
1040
1046
|
if enriched_shaps is not None:
|
|
1041
1047
|
self._update_shap_values(enriched_shaps)
|
|
@@ -1048,7 +1054,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1048
1054
|
else:
|
|
1049
1055
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
|
1050
1056
|
if etalon_metric is not None and enriched_metric is not None:
|
|
1051
|
-
uplift = (
|
|
1057
|
+
uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
|
|
1052
1058
|
|
|
1053
1059
|
train_metrics = {
|
|
1054
1060
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
@@ -1091,9 +1097,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1091
1097
|
f"Calculate baseline {metric} on eval set {idx + 1} "
|
|
1092
1098
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
|
1093
1099
|
)
|
|
1094
|
-
|
|
1100
|
+
etalon_eval_results = baseline_estimator.calculate_metric(
|
|
1095
1101
|
eval_X_sorted, eval_y_sorted, self.baseline_score_column
|
|
1096
1102
|
)
|
|
1103
|
+
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
|
1097
1104
|
self.logger.info(
|
|
1098
1105
|
f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
|
|
1099
1106
|
)
|
|
@@ -1105,9 +1112,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1105
1112
|
f"Calculate enriched {metric} on eval set {idx + 1} "
|
|
1106
1113
|
f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
|
|
1107
1114
|
)
|
|
1108
|
-
|
|
1115
|
+
enriched_eval_results = enriched_estimator.calculate_metric(
|
|
1109
1116
|
enriched_eval_X_sorted, enriched_eval_y_sorted
|
|
1110
1117
|
)
|
|
1118
|
+
enriched_eval_metric = enriched_eval_results.get_display_metric()
|
|
1111
1119
|
self.logger.info(
|
|
1112
1120
|
f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
|
|
1113
1121
|
)
|
|
@@ -1115,7 +1123,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1115
1123
|
enriched_eval_metric = None
|
|
1116
1124
|
|
|
1117
1125
|
if etalon_eval_metric is not None and enriched_eval_metric is not None:
|
|
1118
|
-
eval_uplift = (
|
|
1126
|
+
eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
|
|
1119
1127
|
else:
|
|
1120
1128
|
eval_uplift = None
|
|
1121
1129
|
|
|
@@ -1198,9 +1206,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1198
1206
|
|
|
1199
1207
|
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1200
1208
|
new_shaps = {
|
|
1201
|
-
feature:
|
|
1202
|
-
for feature, shap in new_shaps.items()
|
|
1203
|
-
if feature in self.feature_names_
|
|
1209
|
+
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1204
1210
|
}
|
|
1205
1211
|
features_importances = list(new_shaps.items())
|
|
1206
1212
|
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
@@ -1437,7 +1443,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1437
1443
|
client_features = [
|
|
1438
1444
|
c
|
|
1439
1445
|
for c in X_sampled.columns.to_list()
|
|
1440
|
-
if c
|
|
1446
|
+
if (not self.select_features or c in self.feature_names_)
|
|
1447
|
+
and c
|
|
1441
1448
|
not in (
|
|
1442
1449
|
excluding_search_keys
|
|
1443
1450
|
+ list(self.fit_dropped_features)
|
|
@@ -2063,7 +2070,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2063
2070
|
|
|
2064
2071
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2065
2072
|
|
|
2066
|
-
columns_to_drop = [
|
|
2073
|
+
columns_to_drop = [
|
|
2074
|
+
c for c in validated_X.columns if c in self.feature_names_ and c not in self.client_feature_names_
|
|
2075
|
+
]
|
|
2067
2076
|
if len(columns_to_drop) > 0:
|
|
2068
2077
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2069
2078
|
self.logger.warning(msg)
|
|
@@ -3490,15 +3499,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3490
3499
|
|
|
3491
3500
|
return result_train, result_eval_sets
|
|
3492
3501
|
|
|
3493
|
-
@staticmethod
|
|
3494
|
-
def _round_shap_value(shap: float) -> float:
|
|
3495
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3496
|
-
return 0.0001
|
|
3497
|
-
else:
|
|
3498
|
-
return round(shap, 4)
|
|
3499
|
-
|
|
3500
3502
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3501
|
-
llm_source = "LLM with external data augmentation"
|
|
3502
3503
|
if self._search_task is None:
|
|
3503
3504
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3504
3505
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3509,116 +3510,36 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3509
3510
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3510
3511
|
|
|
3511
3512
|
self.feature_names_ = []
|
|
3513
|
+
self.client_feature_names_ = []
|
|
3512
3514
|
self.feature_importances_ = []
|
|
3513
3515
|
features_info = []
|
|
3514
3516
|
features_info_without_links = []
|
|
3515
3517
|
internal_features_info = []
|
|
3516
3518
|
|
|
3517
|
-
def list_or_single(lst: List[str], single: str):
|
|
3518
|
-
return lst or ([single] if single else [])
|
|
3519
|
-
|
|
3520
|
-
def to_anchor(link: str, value: str) -> str:
|
|
3521
|
-
if not value:
|
|
3522
|
-
return ""
|
|
3523
|
-
elif not link:
|
|
3524
|
-
return value
|
|
3525
|
-
elif value == llm_source:
|
|
3526
|
-
return value
|
|
3527
|
-
else:
|
|
3528
|
-
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
3529
|
-
|
|
3530
|
-
def make_links(names: List[str], links: List[str]):
|
|
3531
|
-
all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
3532
|
-
return ",".join(all_links)
|
|
3533
|
-
|
|
3534
3519
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3535
3520
|
for feature_meta in features_meta:
|
|
3536
3521
|
if feature_meta.name in original_names_dict.keys():
|
|
3537
3522
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3538
|
-
# Use only
|
|
3523
|
+
# Use only important features
|
|
3539
3524
|
if (
|
|
3540
|
-
feature_meta.
|
|
3541
|
-
or feature_meta.name
|
|
3542
|
-
or feature_meta.
|
|
3543
|
-
or feature_meta.name in self.fit_generated_features
|
|
3525
|
+
(feature_meta.shap_value == 0.0)
|
|
3526
|
+
or (feature_meta.name in self.fit_generated_features)
|
|
3527
|
+
or (feature_meta.name == COUNTRY)
|
|
3544
3528
|
):
|
|
3545
3529
|
continue
|
|
3546
3530
|
|
|
3547
|
-
|
|
3548
|
-
|
|
3549
|
-
self.
|
|
3550
|
-
|
|
3551
|
-
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3552
|
-
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
3553
|
-
feature_sample = [round(f, 4) for f in feature_sample]
|
|
3554
|
-
feature_sample = [str(f) for f in feature_sample]
|
|
3555
|
-
feature_sample = ", ".join(feature_sample)
|
|
3556
|
-
if len(feature_sample) > 30:
|
|
3557
|
-
feature_sample = feature_sample[:30] + "..."
|
|
3558
|
-
|
|
3559
|
-
internal_provider = feature_meta.data_provider or "Upgini"
|
|
3560
|
-
providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
3561
|
-
provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
3562
|
-
if providers:
|
|
3563
|
-
provider = make_links(providers, provider_links)
|
|
3564
|
-
else:
|
|
3565
|
-
provider = to_anchor("https://upgini.com", "Upgini")
|
|
3531
|
+
is_client_feature = feature_meta.name in x_columns
|
|
3532
|
+
# In select_features mode we select also from etalon features and need to show them
|
|
3533
|
+
if not self.select_features and is_client_feature:
|
|
3534
|
+
continue
|
|
3566
3535
|
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
|
|
3570
|
-
else ""
|
|
3571
|
-
)
|
|
3572
|
-
sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
3573
|
-
source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
3574
|
-
if sources:
|
|
3575
|
-
source = make_links(sources, source_links)
|
|
3576
|
-
else:
|
|
3577
|
-
source = internal_source
|
|
3536
|
+
self.feature_names_.append(feature_meta.name)
|
|
3537
|
+
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3578
3538
|
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
feature_name = internal_feature_name
|
|
3584
|
-
|
|
3585
|
-
features_info.append(
|
|
3586
|
-
{
|
|
3587
|
-
self.bundle.get("features_info_name"): feature_name,
|
|
3588
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3589
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3590
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3591
|
-
self.bundle.get("features_info_provider"): provider,
|
|
3592
|
-
self.bundle.get("features_info_source"): source,
|
|
3593
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3594
|
-
}
|
|
3595
|
-
)
|
|
3596
|
-
features_info_without_links.append(
|
|
3597
|
-
{
|
|
3598
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3599
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3600
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3601
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3602
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3603
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3604
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3605
|
-
}
|
|
3606
|
-
)
|
|
3607
|
-
internal_features_info.append(
|
|
3608
|
-
{
|
|
3609
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3610
|
-
"feature_link": feature_meta.doc_link,
|
|
3611
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3612
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3613
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3614
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3615
|
-
"provider_link": feature_meta.data_provider_link,
|
|
3616
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3617
|
-
"source_link": feature_meta.data_source_link,
|
|
3618
|
-
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3619
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3620
|
-
}
|
|
3621
|
-
)
|
|
3539
|
+
feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
|
|
3540
|
+
features_info.append(feature_info.to_row(self.bundle))
|
|
3541
|
+
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
|
3542
|
+
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
|
3622
3543
|
|
|
3623
3544
|
if len(features_info) > 0:
|
|
3624
3545
|
self.features_info = pd.DataFrame(features_info)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
import inspect
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
210
211
|
}
|
|
211
212
|
|
|
212
213
|
|
|
214
|
+
@dataclass
|
|
215
|
+
class _CrossValResults:
|
|
216
|
+
metric: Optional[float]
|
|
217
|
+
metric_std: Optional[float]
|
|
218
|
+
shap_values: Optional[Dict[str, float]]
|
|
219
|
+
|
|
220
|
+
def get_display_metric(self) -> Optional[str]:
|
|
221
|
+
if self.metric is None:
|
|
222
|
+
return None
|
|
223
|
+
elif self.metric_std is None:
|
|
224
|
+
return f"{self.metric:.3f}"
|
|
225
|
+
else:
|
|
226
|
+
return f"{self.metric:.3f} ± {self.metric_std:.3f}"
|
|
227
|
+
|
|
228
|
+
|
|
213
229
|
class EstimatorWrapper:
|
|
214
230
|
def __init__(
|
|
215
231
|
self,
|
|
@@ -297,11 +313,11 @@ class EstimatorWrapper:
|
|
|
297
313
|
|
|
298
314
|
def cross_val_predict(
|
|
299
315
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
300
|
-
) ->
|
|
316
|
+
) -> _CrossValResults:
|
|
301
317
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
302
318
|
|
|
303
319
|
if x.shape[1] == 0:
|
|
304
|
-
return None
|
|
320
|
+
return _CrossValResults(metric=None, metric_std=None, shap_values=None)
|
|
305
321
|
|
|
306
322
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
307
323
|
|
|
@@ -326,7 +342,7 @@ class EstimatorWrapper:
|
|
|
326
342
|
|
|
327
343
|
self.check_fold_metrics(metrics_by_fold)
|
|
328
344
|
|
|
329
|
-
metric =
|
|
345
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
|
|
330
346
|
|
|
331
347
|
splits = self.cv.split(x, y, groups)
|
|
332
348
|
|
|
@@ -351,7 +367,7 @@ class EstimatorWrapper:
|
|
|
351
367
|
else:
|
|
352
368
|
average_shap_values = None
|
|
353
369
|
|
|
354
|
-
return
|
|
370
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
|
|
355
371
|
|
|
356
372
|
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
|
|
357
373
|
return shap_values
|
|
@@ -367,17 +383,25 @@ class EstimatorWrapper:
|
|
|
367
383
|
metric = 2 * metric - 1
|
|
368
384
|
return metric
|
|
369
385
|
|
|
370
|
-
def calculate_metric(
|
|
386
|
+
def calculate_metric(
|
|
387
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
388
|
+
) -> _CrossValResults:
|
|
371
389
|
x, y, _ = self._prepare_to_calculate(x, y)
|
|
372
390
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
373
|
-
metric = roc_auc_score(y, x[baseline_score_column])
|
|
391
|
+
metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
|
|
374
392
|
else:
|
|
375
393
|
metrics = []
|
|
376
394
|
for est in self.cv_estimators:
|
|
377
395
|
metrics.append(self.scorer(est, x, y))
|
|
378
396
|
|
|
379
|
-
metric =
|
|
380
|
-
return
|
|
397
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics)
|
|
398
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
|
|
399
|
+
|
|
400
|
+
def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
|
|
401
|
+
metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
|
|
402
|
+
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
403
|
+
metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
|
|
404
|
+
return metric, metric_std
|
|
381
405
|
|
|
382
406
|
@staticmethod
|
|
383
407
|
def create(
|
|
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
591
615
|
|
|
592
616
|
def cross_val_predict(
|
|
593
617
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
594
|
-
) ->
|
|
618
|
+
) -> _CrossValResults:
|
|
595
619
|
try:
|
|
596
620
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
597
621
|
except Exception as e:
|
|
@@ -111,7 +111,7 @@ class DateTimeSearchKeyConverter:
|
|
|
111
111
|
|
|
112
112
|
# Define function to apply sine and cosine transformations
|
|
113
113
|
def add_cyclical_features(df, column, period):
|
|
114
|
-
period_suffix = f"_{period}" if column !=
|
|
114
|
+
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
115
|
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
116
|
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
117
|
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
@@ -137,6 +137,7 @@ class DateTimeSearchKeyConverter:
|
|
|
137
137
|
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
138
138
|
|
|
139
139
|
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
140
|
+
end.index = df.index
|
|
140
141
|
|
|
141
142
|
df["days_in_quarter"] = (end - start).dt.days
|
|
142
143
|
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
+
from upgini.resource_bundle import ResourceBundle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class FeatureInfo:
|
|
17
|
+
name: str
|
|
18
|
+
internal_name: str
|
|
19
|
+
rounded_shap: float
|
|
20
|
+
hitrate: float
|
|
21
|
+
value_preview: str
|
|
22
|
+
provider: str
|
|
23
|
+
internal_provider: str
|
|
24
|
+
source: str
|
|
25
|
+
internal_source: str
|
|
26
|
+
update_frequency: str
|
|
27
|
+
commercial_schema: str
|
|
28
|
+
doc_link: str
|
|
29
|
+
data_provider_link: str
|
|
30
|
+
data_source_link: str
|
|
31
|
+
|
|
32
|
+
@staticmethod
|
|
33
|
+
def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
|
|
34
|
+
return FeatureInfo(
|
|
35
|
+
name=_get_name(feature_meta),
|
|
36
|
+
internal_name=_get_internal_name(feature_meta),
|
|
37
|
+
rounded_shap=_round_shap_value(feature_meta.shap_value),
|
|
38
|
+
hitrate=feature_meta.hit_rate,
|
|
39
|
+
value_preview=_get_feature_sample(feature_meta, data),
|
|
40
|
+
provider=_get_provider(feature_meta, is_client_feature),
|
|
41
|
+
internal_provider=_get_internal_provider(feature_meta, is_client_feature),
|
|
42
|
+
source=_get_source(feature_meta, is_client_feature),
|
|
43
|
+
internal_source=_get_internal_source(feature_meta, is_client_feature),
|
|
44
|
+
update_frequency=feature_meta.update_frequency,
|
|
45
|
+
commercial_schema=feature_meta.commercial_schema,
|
|
46
|
+
doc_link=feature_meta.doc_link,
|
|
47
|
+
data_provider_link=feature_meta.data_provider_link,
|
|
48
|
+
data_source_link=feature_meta.data_source_link,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
52
|
+
return {
|
|
53
|
+
bundle.get("features_info_name"): self.name,
|
|
54
|
+
bundle.get("features_info_shap"): self.rounded_shap,
|
|
55
|
+
bundle.get("features_info_hitrate"): self.hitrate,
|
|
56
|
+
bundle.get("features_info_value_preview"): self.value_preview,
|
|
57
|
+
bundle.get("features_info_provider"): self.provider,
|
|
58
|
+
bundle.get("features_info_source"): self.source,
|
|
59
|
+
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
63
|
+
return {
|
|
64
|
+
bundle.get("features_info_name"): self.internal_name,
|
|
65
|
+
bundle.get("features_info_shap"): self.rounded_shap,
|
|
66
|
+
bundle.get("features_info_hitrate"): self.hitrate,
|
|
67
|
+
bundle.get("features_info_value_preview"): self.value_preview,
|
|
68
|
+
bundle.get("features_info_provider"): self.internal_provider,
|
|
69
|
+
bundle.get("features_info_source"): self.internal_source,
|
|
70
|
+
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
74
|
+
return {
|
|
75
|
+
bundle.get("features_info_name"): self.internal_name,
|
|
76
|
+
"feature_link": self.doc_link,
|
|
77
|
+
bundle.get("features_info_shap"): self.rounded_shap,
|
|
78
|
+
bundle.get("features_info_hitrate"): self.hitrate,
|
|
79
|
+
bundle.get("features_info_value_preview"): self.value_preview,
|
|
80
|
+
bundle.get("features_info_provider"): self.internal_provider,
|
|
81
|
+
"provider_link": self.data_provider_link,
|
|
82
|
+
bundle.get("features_info_source"): self.internal_source,
|
|
83
|
+
"source_link": self.data_source_link,
|
|
84
|
+
bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
|
|
85
|
+
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
|
|
90
|
+
if feature_meta.name in data.columns:
|
|
91
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
92
|
+
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
93
|
+
feature_sample = [round(f, 4) for f in feature_sample]
|
|
94
|
+
feature_sample = [str(f) for f in feature_sample]
|
|
95
|
+
feature_sample = ", ".join(feature_sample)
|
|
96
|
+
if len(feature_sample) > 30:
|
|
97
|
+
feature_sample = feature_sample[:30] + "..."
|
|
98
|
+
else:
|
|
99
|
+
feature_sample = ""
|
|
100
|
+
return feature_sample
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
104
|
+
if feature_meta.doc_link:
|
|
105
|
+
return _to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
106
|
+
else:
|
|
107
|
+
return feature_meta.name
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
111
|
+
return feature_meta.name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
115
|
+
providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
116
|
+
provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
117
|
+
if providers:
|
|
118
|
+
provider = _make_links(providers, provider_links)
|
|
119
|
+
else:
|
|
120
|
+
provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
|
|
121
|
+
return provider
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
125
|
+
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
129
|
+
sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
130
|
+
source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
131
|
+
if sources:
|
|
132
|
+
source = _make_links(sources, source_links)
|
|
133
|
+
else:
|
|
134
|
+
source = _get_internal_source(feature_meta, is_client_feature)
|
|
135
|
+
return source
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
139
|
+
return feature_meta.data_source or (
|
|
140
|
+
LLM_SOURCE
|
|
141
|
+
if not feature_meta.name.endswith("_country")
|
|
142
|
+
and not feature_meta.name.endswith("_postal_code")
|
|
143
|
+
and not is_client_feature
|
|
144
|
+
else ""
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _list_or_single(lst: List[str], single: str):
|
|
149
|
+
return lst or ([single] if single else [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_anchor(link: str, value: str) -> str:
|
|
153
|
+
if not value:
|
|
154
|
+
return ""
|
|
155
|
+
elif not link:
|
|
156
|
+
return value
|
|
157
|
+
elif value == LLM_SOURCE:
|
|
158
|
+
return value
|
|
159
|
+
else:
|
|
160
|
+
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _make_links(names: List[str], links: List[str]):
|
|
164
|
+
all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
165
|
+
return ",".join(all_links)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _round_shap_value(shap: float) -> float:
|
|
169
|
+
if shap > 0.0 and shap < 0.0001:
|
|
170
|
+
return 0.0001
|
|
171
|
+
else:
|
|
172
|
+
return round(shap, 4)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.29a3"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|