upgini 1.2.29a4__py3-none-any.whl → 1.2.29a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +41 -111
- upgini/utils/datetime_utils.py +2 -1
- upgini/utils/feature_info.py +172 -0
- {upgini-1.2.29a4.dist-info → upgini-1.2.29a6.dist-info}/METADATA +1 -1
- {upgini-1.2.29a4.dist-info → upgini-1.2.29a6.dist-info}/RECORD +8 -7
- {upgini-1.2.29a4.dist-info → upgini-1.2.29a6.dist-info}/WHEEL +0 -0
- {upgini-1.2.29a4.dist-info → upgini-1.2.29a6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.29a4"
|
|
1
|
+
__version__ = "1.2.29a6"
|
upgini/features_enricher.py
CHANGED
|
@@ -54,6 +54,7 @@ from upgini.metadata import (
|
|
|
54
54
|
SYSTEM_RECORD_ID,
|
|
55
55
|
TARGET,
|
|
56
56
|
CVType,
|
|
57
|
+
FeaturesMetadataV2,
|
|
57
58
|
FileColumnMeaningType,
|
|
58
59
|
ModelTaskType,
|
|
59
60
|
RuntimeParameters,
|
|
@@ -95,6 +96,7 @@ from upgini.utils.email_utils import (
|
|
|
95
96
|
EmailSearchKeyConverter,
|
|
96
97
|
EmailSearchKeyDetector,
|
|
97
98
|
)
|
|
99
|
+
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
98
100
|
from upgini.utils.features_validator import FeaturesValidator
|
|
99
101
|
from upgini.utils.format import Format
|
|
100
102
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -158,6 +160,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
158
160
|
|
|
159
161
|
shared_datasets: list of str, optional (default=None)
|
|
160
162
|
List of private shared dataset ids for custom search
|
|
163
|
+
|
|
164
|
+
select_features: bool, optional (default=False)
|
|
165
|
+
If True, return only selected features both from input and data sources.
|
|
166
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
161
167
|
"""
|
|
162
168
|
|
|
163
169
|
TARGET_NAME = "target"
|
|
@@ -224,6 +230,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
224
230
|
client_visitorid: Optional[str] = None,
|
|
225
231
|
custom_bundle_config: Optional[str] = None,
|
|
226
232
|
add_date_if_missing: bool = True,
|
|
233
|
+
select_features: bool = False,
|
|
227
234
|
**kwargs,
|
|
228
235
|
):
|
|
229
236
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,8 +284,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
284
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
278
285
|
self.metrics: Optional[pd.DataFrame] = None
|
|
279
286
|
self.feature_names_ = []
|
|
287
|
+
self.dropped_client_feature_names_ = []
|
|
280
288
|
self.feature_importances_ = []
|
|
281
289
|
self.search_id = search_id
|
|
290
|
+
self.select_features = select_features
|
|
291
|
+
|
|
282
292
|
if search_id:
|
|
283
293
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
284
294
|
|
|
@@ -1201,9 +1211,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1201
1211
|
|
|
1202
1212
|
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1203
1213
|
new_shaps = {
|
|
1204
|
-
feature:
|
|
1205
|
-
for feature, shap in new_shaps.items()
|
|
1206
|
-
if feature in self.feature_names_
|
|
1214
|
+
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1207
1215
|
}
|
|
1208
1216
|
features_importances = list(new_shaps.items())
|
|
1209
1217
|
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
@@ -1440,7 +1448,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1440
1448
|
client_features = [
|
|
1441
1449
|
c
|
|
1442
1450
|
for c in X_sampled.columns.to_list()
|
|
1443
|
-
if c
|
|
1451
|
+
if (not self.select_features or c in self.feature_names_)
|
|
1452
|
+
and c
|
|
1444
1453
|
not in (
|
|
1445
1454
|
excluding_search_keys
|
|
1446
1455
|
+ list(self.fit_dropped_features)
|
|
@@ -2066,7 +2075,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2066
2075
|
|
|
2067
2076
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2068
2077
|
|
|
2069
|
-
columns_to_drop = [
|
|
2078
|
+
columns_to_drop = [
|
|
2079
|
+
c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2080
|
+
]
|
|
2070
2081
|
if len(columns_to_drop) > 0:
|
|
2071
2082
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2072
2083
|
self.logger.warning(msg)
|
|
@@ -2322,11 +2333,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2322
2333
|
else:
|
|
2323
2334
|
result = enrich()
|
|
2324
2335
|
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2336
|
+
selecting_columns = [
|
|
2337
|
+
c
|
|
2338
|
+
for c in itertools.chain(validated_X.columns.tolist(), generated_features)
|
|
2339
|
+
if c not in self.dropped_client_feature_names_
|
|
2328
2340
|
]
|
|
2329
|
-
|
|
2341
|
+
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2342
|
+
selecting_columns.extend(c for c in filtered_columns if c in result.columns and c not in validated_X.columns)
|
|
2330
2343
|
if add_fit_system_record_id:
|
|
2331
2344
|
selecting_columns.append(SORT_ID)
|
|
2332
2345
|
|
|
@@ -3493,15 +3506,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3493
3506
|
|
|
3494
3507
|
return result_train, result_eval_sets
|
|
3495
3508
|
|
|
3496
|
-
@staticmethod
|
|
3497
|
-
def _round_shap_value(shap: float) -> float:
|
|
3498
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3499
|
-
return 0.0001
|
|
3500
|
-
else:
|
|
3501
|
-
return round(shap, 4)
|
|
3502
|
-
|
|
3503
3509
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3504
|
-
llm_source = "LLM with external data augmentation"
|
|
3505
3510
|
if self._search_task is None:
|
|
3506
3511
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3507
3512
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3512,116 +3517,41 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3512
3517
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3513
3518
|
|
|
3514
3519
|
self.feature_names_ = []
|
|
3520
|
+
self.dropped_client_feature_names_ = []
|
|
3515
3521
|
self.feature_importances_ = []
|
|
3516
3522
|
features_info = []
|
|
3517
3523
|
features_info_without_links = []
|
|
3518
3524
|
internal_features_info = []
|
|
3519
3525
|
|
|
3520
|
-
def list_or_single(lst: List[str], single: str):
|
|
3521
|
-
return lst or ([single] if single else [])
|
|
3522
|
-
|
|
3523
|
-
def to_anchor(link: str, value: str) -> str:
|
|
3524
|
-
if not value:
|
|
3525
|
-
return ""
|
|
3526
|
-
elif not link:
|
|
3527
|
-
return value
|
|
3528
|
-
elif value == llm_source:
|
|
3529
|
-
return value
|
|
3530
|
-
else:
|
|
3531
|
-
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
3532
|
-
|
|
3533
|
-
def make_links(names: List[str], links: List[str]):
|
|
3534
|
-
all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
3535
|
-
return ",".join(all_links)
|
|
3536
|
-
|
|
3537
3526
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3538
3527
|
for feature_meta in features_meta:
|
|
3539
3528
|
if feature_meta.name in original_names_dict.keys():
|
|
3540
3529
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3541
|
-
|
|
3530
|
+
|
|
3531
|
+
is_client_feature = feature_meta.name in x_columns
|
|
3532
|
+
|
|
3533
|
+
if feature_meta.shap_value == 0.0:
|
|
3534
|
+
if self.select_features:
|
|
3535
|
+
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3536
|
+
continue
|
|
3537
|
+
|
|
3538
|
+
# Use only important features
|
|
3542
3539
|
if (
|
|
3543
|
-
feature_meta.name in
|
|
3540
|
+
feature_meta.name in self.fit_generated_features
|
|
3544
3541
|
or feature_meta.name == COUNTRY
|
|
3545
|
-
|
|
3546
|
-
or
|
|
3542
|
+
# In select_features mode we select also from etalon features and need to show them
|
|
3543
|
+
or (not self.select_features and is_client_feature)
|
|
3547
3544
|
):
|
|
3548
3545
|
continue
|
|
3549
3546
|
|
|
3550
|
-
feature_sample = []
|
|
3551
|
-
self.feature_names_.append(feature_meta.name)
|
|
3552
|
-
self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
|
|
3553
|
-
if feature_meta.name in features_df.columns:
|
|
3554
|
-
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3555
|
-
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
3556
|
-
feature_sample = [round(f, 4) for f in feature_sample]
|
|
3557
|
-
feature_sample = [str(f) for f in feature_sample]
|
|
3558
|
-
feature_sample = ", ".join(feature_sample)
|
|
3559
|
-
if len(feature_sample) > 30:
|
|
3560
|
-
feature_sample = feature_sample[:30] + "..."
|
|
3561
|
-
|
|
3562
|
-
internal_provider = feature_meta.data_provider or "Upgini"
|
|
3563
|
-
providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
3564
|
-
provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
3565
|
-
if providers:
|
|
3566
|
-
provider = make_links(providers, provider_links)
|
|
3567
|
-
else:
|
|
3568
|
-
provider = to_anchor("https://upgini.com", "Upgini")
|
|
3569
3547
|
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
|
|
3573
|
-
else ""
|
|
3574
|
-
)
|
|
3575
|
-
sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
3576
|
-
source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
3577
|
-
if sources:
|
|
3578
|
-
source = make_links(sources, source_links)
|
|
3579
|
-
else:
|
|
3580
|
-
source = internal_source
|
|
3548
|
+
self.feature_names_.append(feature_meta.name)
|
|
3549
|
+
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3581
3550
|
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3586
|
-
feature_name = internal_feature_name
|
|
3587
|
-
|
|
3588
|
-
features_info.append(
|
|
3589
|
-
{
|
|
3590
|
-
self.bundle.get("features_info_name"): feature_name,
|
|
3591
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3592
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3593
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3594
|
-
self.bundle.get("features_info_provider"): provider,
|
|
3595
|
-
self.bundle.get("features_info_source"): source,
|
|
3596
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3597
|
-
}
|
|
3598
|
-
)
|
|
3599
|
-
features_info_without_links.append(
|
|
3600
|
-
{
|
|
3601
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3602
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3603
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3604
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3605
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3606
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3607
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3608
|
-
}
|
|
3609
|
-
)
|
|
3610
|
-
internal_features_info.append(
|
|
3611
|
-
{
|
|
3612
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3613
|
-
"feature_link": feature_meta.doc_link,
|
|
3614
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3615
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3616
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3617
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3618
|
-
"provider_link": feature_meta.data_provider_link,
|
|
3619
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3620
|
-
"source_link": feature_meta.data_source_link,
|
|
3621
|
-
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3622
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3623
|
-
}
|
|
3624
|
-
)
|
|
3551
|
+
feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
|
|
3552
|
+
features_info.append(feature_info.to_row(self.bundle))
|
|
3553
|
+
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
|
3554
|
+
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
|
3625
3555
|
|
|
3626
3556
|
if len(features_info) > 0:
|
|
3627
3557
|
self.features_info = pd.DataFrame(features_info)
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -137,8 +137,9 @@ class DateTimeSearchKeyConverter:
|
|
|
137
137
|
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
138
138
|
|
|
139
139
|
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
140
|
+
end.index = df.index
|
|
140
141
|
|
|
141
|
-
df["days_in_quarter"] = (end
|
|
142
|
+
df["days_in_quarter"] = (end - start).dt.days
|
|
142
143
|
|
|
143
144
|
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
|
144
145
|
|
|
upgini/utils/feature_info.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
+
from upgini.resource_bundle import ResourceBundle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class FeatureInfo:
|
|
17
|
+
name: str
|
|
18
|
+
internal_name: str
|
|
19
|
+
rounded_shap: float
|
|
20
|
+
hitrate: float
|
|
21
|
+
value_preview: str
|
|
22
|
+
provider: str
|
|
23
|
+
internal_provider: str
|
|
24
|
+
source: str
|
|
25
|
+
internal_source: str
|
|
26
|
+
update_frequency: str
|
|
27
|
+
commercial_schema: str
|
|
28
|
+
doc_link: str
|
|
29
|
+
data_provider_link: str
|
|
30
|
+
data_source_link: str
|
|
31
|
+
|
|
32
|
+
@staticmethod
|
|
33
|
+
def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
|
|
34
|
+
return FeatureInfo(
|
|
35
|
+
name=_get_name(feature_meta),
|
|
36
|
+
internal_name=_get_internal_name(feature_meta),
|
|
37
|
+
rounded_shap=_round_shap_value(feature_meta.shap_value),
|
|
38
|
+
hitrate=feature_meta.hit_rate,
|
|
39
|
+
value_preview=_get_feature_sample(feature_meta, data),
|
|
40
|
+
provider=_get_provider(feature_meta, is_client_feature),
|
|
41
|
+
internal_provider=_get_internal_provider(feature_meta, is_client_feature),
|
|
42
|
+
source=_get_source(feature_meta, is_client_feature),
|
|
43
|
+
internal_source=_get_internal_source(feature_meta, is_client_feature),
|
|
44
|
+
update_frequency=feature_meta.update_frequency,
|
|
45
|
+
commercial_schema=feature_meta.commercial_schema,
|
|
46
|
+
doc_link=feature_meta.doc_link,
|
|
47
|
+
data_provider_link=feature_meta.data_provider_link,
|
|
48
|
+
data_source_link=feature_meta.data_source_link,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
52
|
+
return {
|
|
53
|
+
bundle.get("features_info_name"): self.name,
|
|
54
|
+
bundle.get("features_info_shap"): self.rounded_shap,
|
|
55
|
+
bundle.get("features_info_hitrate"): self.hitrate,
|
|
56
|
+
bundle.get("features_info_value_preview"): self.value_preview,
|
|
57
|
+
bundle.get("features_info_provider"): self.provider,
|
|
58
|
+
bundle.get("features_info_source"): self.source,
|
|
59
|
+
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
63
|
+
return {
|
|
64
|
+
bundle.get("features_info_name"): self.internal_name,
|
|
65
|
+
bundle.get("features_info_shap"): self.rounded_shap,
|
|
66
|
+
bundle.get("features_info_hitrate"): self.hitrate,
|
|
67
|
+
bundle.get("features_info_value_preview"): self.value_preview,
|
|
68
|
+
bundle.get("features_info_provider"): self.internal_provider,
|
|
69
|
+
bundle.get("features_info_source"): self.internal_source,
|
|
70
|
+
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
74
|
+
return {
|
|
75
|
+
bundle.get("features_info_name"): self.internal_name,
|
|
76
|
+
"feature_link": self.doc_link,
|
|
77
|
+
bundle.get("features_info_shap"): self.rounded_shap,
|
|
78
|
+
bundle.get("features_info_hitrate"): self.hitrate,
|
|
79
|
+
bundle.get("features_info_value_preview"): self.value_preview,
|
|
80
|
+
bundle.get("features_info_provider"): self.internal_provider,
|
|
81
|
+
"provider_link": self.data_provider_link,
|
|
82
|
+
bundle.get("features_info_source"): self.internal_source,
|
|
83
|
+
"source_link": self.data_source_link,
|
|
84
|
+
bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
|
|
85
|
+
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
|
|
90
|
+
if feature_meta.name in data.columns:
|
|
91
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
92
|
+
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
93
|
+
feature_sample = [round(f, 4) for f in feature_sample]
|
|
94
|
+
feature_sample = [str(f) for f in feature_sample]
|
|
95
|
+
feature_sample = ", ".join(feature_sample)
|
|
96
|
+
if len(feature_sample) > 30:
|
|
97
|
+
feature_sample = feature_sample[:30] + "..."
|
|
98
|
+
else:
|
|
99
|
+
feature_sample = ""
|
|
100
|
+
return feature_sample
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
104
|
+
if feature_meta.doc_link:
|
|
105
|
+
return _to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
106
|
+
else:
|
|
107
|
+
return feature_meta.name
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
111
|
+
return feature_meta.name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
115
|
+
providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
116
|
+
provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
117
|
+
if providers:
|
|
118
|
+
provider = _make_links(providers, provider_links)
|
|
119
|
+
else:
|
|
120
|
+
provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
|
|
121
|
+
return provider
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
125
|
+
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
129
|
+
sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
130
|
+
source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
131
|
+
if sources:
|
|
132
|
+
source = _make_links(sources, source_links)
|
|
133
|
+
else:
|
|
134
|
+
source = _get_internal_source(feature_meta, is_client_feature)
|
|
135
|
+
return source
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
139
|
+
return feature_meta.data_source or (
|
|
140
|
+
LLM_SOURCE
|
|
141
|
+
if not feature_meta.name.endswith("_country")
|
|
142
|
+
and not feature_meta.name.endswith("_postal_code")
|
|
143
|
+
and not is_client_feature
|
|
144
|
+
else ""
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _list_or_single(lst: List[str], single: str):
|
|
149
|
+
return lst or ([single] if single else [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_anchor(link: str, value: str) -> str:
|
|
153
|
+
if not value:
|
|
154
|
+
return ""
|
|
155
|
+
elif not link:
|
|
156
|
+
return value
|
|
157
|
+
elif value == LLM_SOURCE:
|
|
158
|
+
return value
|
|
159
|
+
else:
|
|
160
|
+
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _make_links(names: List[str], links: List[str]):
|
|
164
|
+
all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
165
|
+
return ",".join(all_links)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _round_shap_value(shap: float) -> float:
|
|
169
|
+
if shap > 0.0 and shap < 0.0001:
|
|
170
|
+
return 0.0001
|
|
171
|
+
else:
|
|
172
|
+
return round(shap, 4)
|
|
{upgini-1.2.29a4.dist-info → upgini-1.2.29a6.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=IwPdYvZC3KafuIyZFkN_uViBDHIV_KryoYm_uF-6Z5k,25
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256
|
|
6
|
+
upgini/features_enricher.py,sha256=HAu6ZZSCW5BJ83fxuGjuEy2h283EO1sr3j_eUcVldsY,190873
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
@@ -43,11 +43,12 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
43
43
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
44
44
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
45
45
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
46
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
+
upgini/utils/datetime_utils.py,sha256=_uINXZUZ2MXvpGFBcxk_kZKMa1Umd8nhs8Iam-Gbwo0,13025
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
49
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
|
+
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
51
52
|
upgini/utils/features_validator.py,sha256=1Xj2ir5LzzYiX3NH8o88c2J6RTTetaTwu0MhjLTyuvM,3378
|
|
52
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
53
54
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
@@ -58,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
58
59
|
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
59
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
60
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.29a4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
63
|
-
upgini-1.2.29a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
64
|
-
upgini-1.2.29a4.dist-info/RECORD,,
|
|
62
|
+
upgini-1.2.29a6.dist-info/METADATA,sha256=7ZvyeiohsWXQQ1j_7N2H6yVKKhe4pUEpRORxtFBGcH8,48580
|
|
63
|
+
upgini-1.2.29a6.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.29a6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.29a6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|