upgini 1.2.29a4__tar.gz → 1.2.29a5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.29a4 → upgini-1.2.29a5}/PKG-INFO +1 -1
- upgini-1.2.29a5/src/upgini/__about__.py +1 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/features_enricher.py +27 -109
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/datetime_utils.py +2 -1
- upgini-1.2.29a5/src/upgini/utils/feature_info.py +172 -0
- upgini-1.2.29a4/src/upgini/__about__.py +0 -1
- {upgini-1.2.29a4 → upgini-1.2.29a5}/.gitignore +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/LICENSE +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/README.md +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/pyproject.toml +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/ads.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/dataset.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/errors.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/http.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/metadata.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/metrics.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/search_task.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/spinner.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.29a5"
|
|
@@ -2,7 +2,6 @@ import dataclasses
|
|
|
2
2
|
import datetime
|
|
3
3
|
import gc
|
|
4
4
|
import hashlib
|
|
5
|
-
import itertools
|
|
6
5
|
import logging
|
|
7
6
|
import numbers
|
|
8
7
|
import os
|
|
@@ -54,6 +53,7 @@ from upgini.metadata import (
|
|
|
54
53
|
SYSTEM_RECORD_ID,
|
|
55
54
|
TARGET,
|
|
56
55
|
CVType,
|
|
56
|
+
FeaturesMetadataV2,
|
|
57
57
|
FileColumnMeaningType,
|
|
58
58
|
ModelTaskType,
|
|
59
59
|
RuntimeParameters,
|
|
@@ -95,6 +95,7 @@ from upgini.utils.email_utils import (
|
|
|
95
95
|
EmailSearchKeyConverter,
|
|
96
96
|
EmailSearchKeyDetector,
|
|
97
97
|
)
|
|
98
|
+
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
98
99
|
from upgini.utils.features_validator import FeaturesValidator
|
|
99
100
|
from upgini.utils.format import Format
|
|
100
101
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -224,6 +225,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
224
225
|
client_visitorid: Optional[str] = None,
|
|
225
226
|
custom_bundle_config: Optional[str] = None,
|
|
226
227
|
add_date_if_missing: bool = True,
|
|
228
|
+
select_features: bool = False,
|
|
227
229
|
**kwargs,
|
|
228
230
|
):
|
|
229
231
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,8 +279,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
279
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
278
280
|
self.metrics: Optional[pd.DataFrame] = None
|
|
279
281
|
self.feature_names_ = []
|
|
282
|
+
self.client_feature_names_ = []
|
|
280
283
|
self.feature_importances_ = []
|
|
281
284
|
self.search_id = search_id
|
|
285
|
+
self.select_features = select_features
|
|
286
|
+
|
|
282
287
|
if search_id:
|
|
283
288
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
284
289
|
|
|
@@ -1201,9 +1206,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1201
1206
|
|
|
1202
1207
|
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1203
1208
|
new_shaps = {
|
|
1204
|
-
feature:
|
|
1205
|
-
for feature, shap in new_shaps.items()
|
|
1206
|
-
if feature in self.feature_names_
|
|
1209
|
+
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1207
1210
|
}
|
|
1208
1211
|
features_importances = list(new_shaps.items())
|
|
1209
1212
|
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
@@ -1440,7 +1443,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1440
1443
|
client_features = [
|
|
1441
1444
|
c
|
|
1442
1445
|
for c in X_sampled.columns.to_list()
|
|
1443
|
-
if c
|
|
1446
|
+
if (not self.select_features or c in self.feature_names_)
|
|
1447
|
+
and c
|
|
1444
1448
|
not in (
|
|
1445
1449
|
excluding_search_keys
|
|
1446
1450
|
+ list(self.fit_dropped_features)
|
|
@@ -2066,7 +2070,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2066
2070
|
|
|
2067
2071
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2068
2072
|
|
|
2069
|
-
columns_to_drop = [
|
|
2073
|
+
columns_to_drop = [
|
|
2074
|
+
c for c in validated_X.columns if c in self.feature_names_ and c not in self.client_feature_names_
|
|
2075
|
+
]
|
|
2070
2076
|
if len(columns_to_drop) > 0:
|
|
2071
2077
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2072
2078
|
self.logger.warning(msg)
|
|
@@ -3493,15 +3499,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3493
3499
|
|
|
3494
3500
|
return result_train, result_eval_sets
|
|
3495
3501
|
|
|
3496
|
-
@staticmethod
|
|
3497
|
-
def _round_shap_value(shap: float) -> float:
|
|
3498
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3499
|
-
return 0.0001
|
|
3500
|
-
else:
|
|
3501
|
-
return round(shap, 4)
|
|
3502
|
-
|
|
3503
3502
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3504
|
-
llm_source = "LLM with external data augmentation"
|
|
3505
3503
|
if self._search_task is None:
|
|
3506
3504
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3507
3505
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3512,116 +3510,36 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3512
3510
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3513
3511
|
|
|
3514
3512
|
self.feature_names_ = []
|
|
3513
|
+
self.client_feature_names_ = []
|
|
3515
3514
|
self.feature_importances_ = []
|
|
3516
3515
|
features_info = []
|
|
3517
3516
|
features_info_without_links = []
|
|
3518
3517
|
internal_features_info = []
|
|
3519
3518
|
|
|
3520
|
-
def list_or_single(lst: List[str], single: str):
|
|
3521
|
-
return lst or ([single] if single else [])
|
|
3522
|
-
|
|
3523
|
-
def to_anchor(link: str, value: str) -> str:
|
|
3524
|
-
if not value:
|
|
3525
|
-
return ""
|
|
3526
|
-
elif not link:
|
|
3527
|
-
return value
|
|
3528
|
-
elif value == llm_source:
|
|
3529
|
-
return value
|
|
3530
|
-
else:
|
|
3531
|
-
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
3532
|
-
|
|
3533
|
-
def make_links(names: List[str], links: List[str]):
|
|
3534
|
-
all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
3535
|
-
return ",".join(all_links)
|
|
3536
|
-
|
|
3537
3519
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3538
3520
|
for feature_meta in features_meta:
|
|
3539
3521
|
if feature_meta.name in original_names_dict.keys():
|
|
3540
3522
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3541
|
-
# Use only
|
|
3523
|
+
# Use only important features
|
|
3542
3524
|
if (
|
|
3543
|
-
feature_meta.
|
|
3544
|
-
or feature_meta.name
|
|
3545
|
-
or feature_meta.
|
|
3546
|
-
or feature_meta.name in self.fit_generated_features
|
|
3525
|
+
(feature_meta.shap_value == 0.0)
|
|
3526
|
+
or (feature_meta.name in self.fit_generated_features)
|
|
3527
|
+
or (feature_meta.name == COUNTRY)
|
|
3547
3528
|
):
|
|
3548
3529
|
continue
|
|
3549
3530
|
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
self.
|
|
3553
|
-
|
|
3554
|
-
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3555
|
-
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
3556
|
-
feature_sample = [round(f, 4) for f in feature_sample]
|
|
3557
|
-
feature_sample = [str(f) for f in feature_sample]
|
|
3558
|
-
feature_sample = ", ".join(feature_sample)
|
|
3559
|
-
if len(feature_sample) > 30:
|
|
3560
|
-
feature_sample = feature_sample[:30] + "..."
|
|
3561
|
-
|
|
3562
|
-
internal_provider = feature_meta.data_provider or "Upgini"
|
|
3563
|
-
providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
3564
|
-
provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
3565
|
-
if providers:
|
|
3566
|
-
provider = make_links(providers, provider_links)
|
|
3567
|
-
else:
|
|
3568
|
-
provider = to_anchor("https://upgini.com", "Upgini")
|
|
3531
|
+
is_client_feature = feature_meta.name in x_columns
|
|
3532
|
+
# In select_features mode we select also from etalon features and need to show them
|
|
3533
|
+
if not self.select_features and is_client_feature:
|
|
3534
|
+
continue
|
|
3569
3535
|
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
|
|
3573
|
-
else ""
|
|
3574
|
-
)
|
|
3575
|
-
sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
3576
|
-
source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
3577
|
-
if sources:
|
|
3578
|
-
source = make_links(sources, source_links)
|
|
3579
|
-
else:
|
|
3580
|
-
source = internal_source
|
|
3536
|
+
self.feature_names_.append(feature_meta.name)
|
|
3537
|
+
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3581
3538
|
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3586
|
-
feature_name = internal_feature_name
|
|
3587
|
-
|
|
3588
|
-
features_info.append(
|
|
3589
|
-
{
|
|
3590
|
-
self.bundle.get("features_info_name"): feature_name,
|
|
3591
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3592
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3593
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3594
|
-
self.bundle.get("features_info_provider"): provider,
|
|
3595
|
-
self.bundle.get("features_info_source"): source,
|
|
3596
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3597
|
-
}
|
|
3598
|
-
)
|
|
3599
|
-
features_info_without_links.append(
|
|
3600
|
-
{
|
|
3601
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3602
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3603
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3604
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3605
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3606
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3607
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3608
|
-
}
|
|
3609
|
-
)
|
|
3610
|
-
internal_features_info.append(
|
|
3611
|
-
{
|
|
3612
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3613
|
-
"feature_link": feature_meta.doc_link,
|
|
3614
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3615
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3616
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3617
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3618
|
-
"provider_link": feature_meta.data_provider_link,
|
|
3619
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3620
|
-
"source_link": feature_meta.data_source_link,
|
|
3621
|
-
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3622
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3623
|
-
}
|
|
3624
|
-
)
|
|
3539
|
+
feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
|
|
3540
|
+
features_info.append(feature_info.to_row(self.bundle))
|
|
3541
|
+
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
|
3542
|
+
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
|
3625
3543
|
|
|
3626
3544
|
if len(features_info) > 0:
|
|
3627
3545
|
self.features_info = pd.DataFrame(features_info)
|
|
@@ -137,8 +137,9 @@ class DateTimeSearchKeyConverter:
|
|
|
137
137
|
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
138
138
|
|
|
139
139
|
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
140
|
+
end.index = df.index
|
|
140
141
|
|
|
141
|
-
df["days_in_quarter"] = (end
|
|
142
|
+
df["days_in_quarter"] = (end - start).dt.days
|
|
142
143
|
|
|
143
144
|
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
|
144
145
|
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
+
from upgini.resource_bundle import ResourceBundle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class FeatureInfo:
    """Presentation record for a single feature shown in the features-info tables.

    Instances are normally created with :meth:`from_metadata` and then rendered
    into one of three row layouts: with links (:meth:`to_row`), without links
    (:meth:`to_row_without_links`) and the internal layout (:meth:`to_internal_row`).
    """

    name: str  # display name; an HTML anchor when the metadata carries a doc link
    internal_name: str  # plain feature name without markup
    rounded_shap: float
    hitrate: float
    value_preview: str
    provider: str  # display provider, may contain HTML anchors
    internal_provider: str
    source: str  # display source, may contain HTML anchors
    internal_source: str
    update_frequency: str
    commercial_schema: str
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @staticmethod
    def from_metadata(feature_meta: "FeaturesMetadataV2", data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
        """Create a ``FeatureInfo`` from one feature metadata entry.

        Args:
            feature_meta: metadata entry describing the feature.
            data: frame from which a short value preview of the feature is sampled.
            is_client_feature: forwarded to the provider/source helpers, which use it
                to blank out the default provider/source values.

        Returns:
            The populated ``FeatureInfo``.
        """
        display_name = _get_name(feature_meta)
        plain_name = _get_internal_name(feature_meta)
        shap = _round_shap_value(feature_meta.shap_value)
        preview = _get_feature_sample(feature_meta, data)
        display_provider = _get_provider(feature_meta, is_client_feature)
        plain_provider = _get_internal_provider(feature_meta, is_client_feature)
        display_source = _get_source(feature_meta, is_client_feature)
        plain_source = _get_internal_source(feature_meta, is_client_feature)
        return FeatureInfo(
            name=display_name,
            internal_name=plain_name,
            rounded_shap=shap,
            hitrate=feature_meta.hit_rate,
            value_preview=preview,
            provider=display_provider,
            internal_provider=plain_provider,
            source=display_source,
            internal_source=plain_source,
            update_frequency=feature_meta.update_frequency,
            commercial_schema=feature_meta.commercial_schema,
            doc_link=feature_meta.doc_link,
            data_provider_link=feature_meta.data_provider_link,
            data_source_link=feature_meta.data_source_link,
        )

    def _table_row(self, bundle: "ResourceBundle", name: str, provider: str, source: str) -> Dict[str, str]:
        """Assemble the seven-column row shared by the public layouts (column order matters)."""
        label = bundle.get
        return {
            label("features_info_name"): name,
            label("features_info_shap"): self.rounded_shap,
            label("features_info_hitrate"): self.hitrate,
            label("features_info_value_preview"): self.value_preview,
            label("features_info_provider"): provider,
            label("features_info_source"): source,
            label("features_info_update_frequency"): self.update_frequency,
        }

    def to_row(self, bundle: "ResourceBundle") -> Dict[str, str]:
        """Return the row that uses the display (possibly linked) name, provider and source."""
        return self._table_row(bundle, self.name, self.provider, self.source)

    def to_row_without_links(self, bundle: "ResourceBundle") -> Dict[str, str]:
        """Return the row that uses the plain ``internal_*`` name, provider and source."""
        return self._table_row(bundle, self.internal_name, self.internal_provider, self.internal_source)

    def to_internal_row(self, bundle: "ResourceBundle") -> Dict[str, str]:
        """Return the internal row: plain values plus raw links and the commercial schema."""
        label = bundle.get
        row = {}
        row[label("features_info_name")] = self.internal_name
        row["feature_link"] = self.doc_link
        row[label("features_info_shap")] = self.rounded_shap
        row[label("features_info_hitrate")] = self.hitrate
        row[label("features_info_value_preview")] = self.value_preview
        row[label("features_info_provider")] = self.internal_provider
        row["provider_link"] = self.data_provider_link
        row[label("features_info_source")] = self.internal_source
        row["source_link"] = self.data_source_link
        # A missing commercial schema is reported as an empty string.
        row[label("features_info_commercial_schema")] = self.commercial_schema or ""
        row[label("features_info_update_frequency")] = self.update_frequency
        return row
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
|
|
90
|
+
if feature_meta.name in data.columns:
|
|
91
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
92
|
+
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
93
|
+
feature_sample = [round(f, 4) for f in feature_sample]
|
|
94
|
+
feature_sample = [str(f) for f in feature_sample]
|
|
95
|
+
feature_sample = ", ".join(feature_sample)
|
|
96
|
+
if len(feature_sample) > 30:
|
|
97
|
+
feature_sample = feature_sample[:30] + "..."
|
|
98
|
+
else:
|
|
99
|
+
feature_sample = ""
|
|
100
|
+
return feature_sample
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
104
|
+
if feature_meta.doc_link:
|
|
105
|
+
return _to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
106
|
+
else:
|
|
107
|
+
return feature_meta.name
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
111
|
+
return feature_meta.name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_provider(feature_meta: "FeaturesMetadataV2", is_client_feature: bool) -> str:
    """Return the display provider of the feature.

    Provider names (the list, or the single value as a fallback) are turned into
    links via ``_make_links``. Without any provider the result is "" for a client
    feature and an "Upgini" anchor to https://upgini.com otherwise.
    """
    names = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if names:
        return _make_links(names, links)
    if is_client_feature:
        return ""
    return _to_anchor("https://upgini.com", "Upgini")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
125
|
+
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_source(feature_meta: "FeaturesMetadataV2", is_client_feature: bool) -> str:
    """Return the display source of the feature.

    Source names (the list, or the single value as a fallback) are turned into
    links via ``_make_links``; when there are none, the value of
    ``_get_internal_source`` is used instead.
    """
    names = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if not names:
        return _get_internal_source(feature_meta, is_client_feature)
    return _make_links(names, links)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
139
|
+
return feature_meta.data_source or (
|
|
140
|
+
LLM_SOURCE
|
|
141
|
+
if not feature_meta.name.endswith("_country")
|
|
142
|
+
and not feature_meta.name.endswith("_postal_code")
|
|
143
|
+
and not is_client_feature
|
|
144
|
+
else ""
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _list_or_single(lst: List[str], single: str):
|
|
149
|
+
return lst or ([single] if single else [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_anchor(link: str, value: str) -> str:
|
|
153
|
+
if not value:
|
|
154
|
+
return ""
|
|
155
|
+
elif not link:
|
|
156
|
+
return value
|
|
157
|
+
elif value == LLM_SOURCE:
|
|
158
|
+
return value
|
|
159
|
+
else:
|
|
160
|
+
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _make_links(names: List[str], links: List[str]):
    """Join names into a comma-separated string, linking each name to its URL.

    Names and links are paired by position. A name without a matching link is
    passed to ``_to_anchor`` with ``None`` as the link. Entries for which
    ``_to_anchor`` yields an empty string (for example a surplus link that has no
    name) are left out, so the result never has dangling or doubled commas.

    Args:
        names: visible labels.
        links: URLs, positionally matching ``names``; may be shorter or longer.

    Returns:
        The non-empty anchors joined with ",".
    """
    anchors = (_to_anchor(link, name) for name, link in itertools.zip_longest(names, links))
    return ",".join(anchor for anchor in anchors if anchor)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _round_shap_value(shap: float) -> float:
|
|
169
|
+
if shap > 0.0 and shap < 0.0001:
|
|
170
|
+
return 0.0001
|
|
171
|
+
else:
|
|
172
|
+
return round(shap, 4)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.29a4"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|