upgini-1.2.27-py3-none-any.whl → upgini-1.2.29-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +112 -137
- upgini/http.py +1 -1
- upgini/metadata.py +4 -4
- upgini/metrics.py +33 -9
- upgini/resource_bundle/strings.properties +3 -3
- upgini/utils/datetime_utils.py +52 -9
- upgini/utils/email_utils.py +3 -2
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +13 -1
- {upgini-1.2.27.dist-info → upgini-1.2.29.dist-info}/METADATA +1 -1
- {upgini-1.2.27.dist-info → upgini-1.2.29.dist-info}/RECORD +14 -13
- {upgini-1.2.27.dist-info → upgini-1.2.29.dist-info}/WHEEL +0 -0
- {upgini-1.2.27.dist-info → upgini-1.2.29.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.27"
|
|
1
|
+
__version__ = "1.2.29"
|
upgini/features_enricher.py
CHANGED
|
@@ -54,6 +54,7 @@ from upgini.metadata import (
|
|
|
54
54
|
SYSTEM_RECORD_ID,
|
|
55
55
|
TARGET,
|
|
56
56
|
CVType,
|
|
57
|
+
FeaturesMetadataV2,
|
|
57
58
|
FileColumnMeaningType,
|
|
58
59
|
ModelTaskType,
|
|
59
60
|
RuntimeParameters,
|
|
@@ -95,6 +96,7 @@ from upgini.utils.email_utils import (
|
|
|
95
96
|
EmailSearchKeyConverter,
|
|
96
97
|
EmailSearchKeyDetector,
|
|
97
98
|
)
|
|
99
|
+
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
98
100
|
from upgini.utils.features_validator import FeaturesValidator
|
|
99
101
|
from upgini.utils.format import Format
|
|
100
102
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -158,6 +160,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
158
160
|
|
|
159
161
|
shared_datasets: list of str, optional (default=None)
|
|
160
162
|
List of private shared dataset ids for custom search
|
|
163
|
+
|
|
164
|
+
select_features: bool, optional (default=False)
|
|
165
|
+
If True, return only selected features both from input and data sources.
|
|
166
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
161
167
|
"""
|
|
162
168
|
|
|
163
169
|
TARGET_NAME = "target"
|
|
@@ -224,6 +230,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
224
230
|
client_visitorid: Optional[str] = None,
|
|
225
231
|
custom_bundle_config: Optional[str] = None,
|
|
226
232
|
add_date_if_missing: bool = True,
|
|
233
|
+
select_features: bool = False,
|
|
227
234
|
**kwargs,
|
|
228
235
|
):
|
|
229
236
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,8 +284,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
284
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
278
285
|
self.metrics: Optional[pd.DataFrame] = None
|
|
279
286
|
self.feature_names_ = []
|
|
287
|
+
self.dropped_client_feature_names_ = []
|
|
280
288
|
self.feature_importances_ = []
|
|
281
289
|
self.search_id = search_id
|
|
290
|
+
self.select_features = select_features
|
|
291
|
+
|
|
282
292
|
if search_id:
|
|
283
293
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
284
294
|
|
|
@@ -999,9 +1009,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
999
1009
|
text_features=self.generate_features,
|
|
1000
1010
|
has_date=has_date,
|
|
1001
1011
|
)
|
|
1002
|
-
|
|
1012
|
+
etalon_cv_result = baseline_estimator.cross_val_predict(
|
|
1003
1013
|
fitting_X, y_sorted, self.baseline_score_column
|
|
1004
1014
|
)
|
|
1015
|
+
etalon_metric = etalon_cv_result.get_display_metric()
|
|
1005
1016
|
if etalon_metric is None:
|
|
1006
1017
|
self.logger.info(
|
|
1007
1018
|
f"Baseline {metric} on train client features is None (maybe all features was removed)"
|
|
@@ -1033,9 +1044,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1033
1044
|
text_features=self.generate_features,
|
|
1034
1045
|
has_date=has_date,
|
|
1035
1046
|
)
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1047
|
+
enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
1048
|
+
enriched_metric = enriched_cv_result.get_display_metric()
|
|
1049
|
+
enriched_shaps = enriched_cv_result.shap_values
|
|
1039
1050
|
|
|
1040
1051
|
if enriched_shaps is not None:
|
|
1041
1052
|
self._update_shap_values(enriched_shaps)
|
|
@@ -1048,7 +1059,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1048
1059
|
else:
|
|
1049
1060
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
|
1050
1061
|
if etalon_metric is not None and enriched_metric is not None:
|
|
1051
|
-
uplift = (
|
|
1062
|
+
uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
|
|
1052
1063
|
|
|
1053
1064
|
train_metrics = {
|
|
1054
1065
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
@@ -1091,9 +1102,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1091
1102
|
f"Calculate baseline {metric} on eval set {idx + 1} "
|
|
1092
1103
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
|
1093
1104
|
)
|
|
1094
|
-
|
|
1105
|
+
etalon_eval_results = baseline_estimator.calculate_metric(
|
|
1095
1106
|
eval_X_sorted, eval_y_sorted, self.baseline_score_column
|
|
1096
1107
|
)
|
|
1108
|
+
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
|
1097
1109
|
self.logger.info(
|
|
1098
1110
|
f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
|
|
1099
1111
|
)
|
|
@@ -1105,9 +1117,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1105
1117
|
f"Calculate enriched {metric} on eval set {idx + 1} "
|
|
1106
1118
|
f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
|
|
1107
1119
|
)
|
|
1108
|
-
|
|
1120
|
+
enriched_eval_results = enriched_estimator.calculate_metric(
|
|
1109
1121
|
enriched_eval_X_sorted, enriched_eval_y_sorted
|
|
1110
1122
|
)
|
|
1123
|
+
enriched_eval_metric = enriched_eval_results.get_display_metric()
|
|
1111
1124
|
self.logger.info(
|
|
1112
1125
|
f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
|
|
1113
1126
|
)
|
|
@@ -1115,7 +1128,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1115
1128
|
enriched_eval_metric = None
|
|
1116
1129
|
|
|
1117
1130
|
if etalon_eval_metric is not None and enriched_eval_metric is not None:
|
|
1118
|
-
eval_uplift = (
|
|
1131
|
+
eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
|
|
1119
1132
|
else:
|
|
1120
1133
|
eval_uplift = None
|
|
1121
1134
|
|
|
@@ -1198,9 +1211,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1198
1211
|
|
|
1199
1212
|
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1200
1213
|
new_shaps = {
|
|
1201
|
-
feature:
|
|
1202
|
-
for feature, shap in new_shaps.items()
|
|
1203
|
-
if feature in self.feature_names_
|
|
1214
|
+
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1204
1215
|
}
|
|
1205
1216
|
features_importances = list(new_shaps.items())
|
|
1206
1217
|
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
@@ -1249,7 +1260,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1249
1260
|
display_html_dataframe(
|
|
1250
1261
|
self.relevant_data_sources,
|
|
1251
1262
|
self._relevant_data_sources_wo_links,
|
|
1252
|
-
self.bundle.get("
|
|
1263
|
+
self.bundle.get("relevant_data_sources_header"),
|
|
1253
1264
|
display_handle=self.data_sources_display_handle,
|
|
1254
1265
|
)
|
|
1255
1266
|
except (ImportError, NameError):
|
|
@@ -1437,7 +1448,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1437
1448
|
client_features = [
|
|
1438
1449
|
c
|
|
1439
1450
|
for c in X_sampled.columns.to_list()
|
|
1440
|
-
if
|
|
1451
|
+
if (
|
|
1452
|
+
not self.select_features
|
|
1453
|
+
or c in self.feature_names_
|
|
1454
|
+
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1455
|
+
)
|
|
1456
|
+
and c
|
|
1441
1457
|
not in (
|
|
1442
1458
|
excluding_search_keys
|
|
1443
1459
|
+ list(self.fit_dropped_features)
|
|
@@ -1653,7 +1669,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1653
1669
|
generated_features = []
|
|
1654
1670
|
if date_column is not None:
|
|
1655
1671
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1656
|
-
|
|
1672
|
+
# Leave original date column values
|
|
1673
|
+
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1674
|
+
df_with_date_features[date_column] = df[date_column]
|
|
1675
|
+
df = df_with_date_features
|
|
1657
1676
|
generated_features = converter.generated_features
|
|
1658
1677
|
|
|
1659
1678
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1662,9 +1681,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1662
1681
|
df = generator.generate(df)
|
|
1663
1682
|
generated_features.extend(generator.generated_features)
|
|
1664
1683
|
|
|
1665
|
-
normalizer = Normalizer(self.bundle, self.logger)
|
|
1666
|
-
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1667
|
-
columns_renaming = normalizer.columns_renaming
|
|
1684
|
+
# normalizer = Normalizer(self.bundle, self.logger)
|
|
1685
|
+
# df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1686
|
+
# columns_renaming = normalizer.columns_renaming
|
|
1687
|
+
columns_renaming = {c: c for c in df.columns}
|
|
1668
1688
|
|
|
1669
1689
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1670
1690
|
|
|
@@ -1980,9 +2000,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1980
2000
|
file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
|
|
1981
2001
|
search_keys = file_metadata.search_types()
|
|
1982
2002
|
if SearchKey.IPV6_ADDRESS in search_keys:
|
|
1983
|
-
search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
2003
|
+
# search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
2004
|
+
search_keys.pop(SearchKey.IPV6_ADDRESS, None)
|
|
1984
2005
|
|
|
1985
|
-
keys =
|
|
2006
|
+
keys = (
|
|
2007
|
+
"{"
|
|
2008
|
+
+ ", ".join(
|
|
2009
|
+
[
|
|
2010
|
+
f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
|
|
2011
|
+
for key, name in search_keys.items()
|
|
2012
|
+
]
|
|
2013
|
+
)
|
|
2014
|
+
+ "}"
|
|
2015
|
+
)
|
|
1986
2016
|
features_for_transform = self._search_task.get_features_for_transform()
|
|
1987
2017
|
if features_for_transform:
|
|
1988
2018
|
original_features_for_transform = [
|
|
@@ -2026,7 +2056,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2026
2056
|
start_time = time.time()
|
|
2027
2057
|
with MDC(trace_id=trace_id):
|
|
2028
2058
|
self.logger.info("Start transform")
|
|
2029
|
-
|
|
2059
|
+
|
|
2060
|
+
validated_X = self._validate_X(X, is_transform=True)
|
|
2061
|
+
|
|
2062
|
+
self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
|
|
2030
2063
|
|
|
2031
2064
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
2032
2065
|
|
|
@@ -2058,11 +2091,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2058
2091
|
self.logger.info(msg)
|
|
2059
2092
|
print(msg)
|
|
2060
2093
|
|
|
2061
|
-
validated_X = self._validate_X(X, is_transform=True)
|
|
2062
|
-
|
|
2063
2094
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2064
2095
|
|
|
2065
|
-
columns_to_drop = [
|
|
2096
|
+
columns_to_drop = [
|
|
2097
|
+
c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2098
|
+
]
|
|
2066
2099
|
if len(columns_to_drop) > 0:
|
|
2067
2100
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2068
2101
|
self.logger.warning(msg)
|
|
@@ -2091,7 +2124,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2091
2124
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2092
2125
|
if date_column is not None:
|
|
2093
2126
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2094
|
-
df = converter.convert(df)
|
|
2127
|
+
df = converter.convert(df, keep_time=True)
|
|
2095
2128
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2096
2129
|
generated_features.extend(converter.generated_features)
|
|
2097
2130
|
else:
|
|
@@ -2186,11 +2219,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2186
2219
|
|
|
2187
2220
|
if add_fit_system_record_id:
|
|
2188
2221
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2189
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2190
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2191
2222
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2192
2223
|
features_not_to_pass.append(SORT_ID)
|
|
2193
2224
|
|
|
2225
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2226
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2227
|
+
|
|
2194
2228
|
# search keys might be changed after explode
|
|
2195
2229
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2196
2230
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2209,7 +2243,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2209
2243
|
|
|
2210
2244
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2211
2245
|
|
|
2212
|
-
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2246
|
+
df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
|
|
2213
2247
|
|
|
2214
2248
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2215
2249
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -2318,11 +2352,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2318
2352
|
else:
|
|
2319
2353
|
result = enrich()
|
|
2320
2354
|
|
|
2355
|
+
selecting_columns = [
|
|
2356
|
+
c
|
|
2357
|
+
for c in itertools.chain(validated_X.columns.tolist(), generated_features)
|
|
2358
|
+
if c not in self.dropped_client_feature_names_
|
|
2359
|
+
]
|
|
2321
2360
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2322
|
-
|
|
2361
|
+
selecting_columns.extend(
|
|
2323
2362
|
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2324
|
-
|
|
2325
|
-
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2363
|
+
)
|
|
2326
2364
|
if add_fit_system_record_id:
|
|
2327
2365
|
selecting_columns.append(SORT_ID)
|
|
2328
2366
|
|
|
@@ -2476,9 +2514,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2476
2514
|
validate_scoring_argument(scoring)
|
|
2477
2515
|
|
|
2478
2516
|
self.__log_debug_information(
|
|
2479
|
-
|
|
2480
|
-
|
|
2481
|
-
|
|
2517
|
+
validated_X,
|
|
2518
|
+
validated_y,
|
|
2519
|
+
validated_eval_set,
|
|
2482
2520
|
exclude_features_sources=exclude_features_sources,
|
|
2483
2521
|
calculate_metrics=calculate_metrics,
|
|
2484
2522
|
scoring=scoring,
|
|
@@ -3489,15 +3527,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3489
3527
|
|
|
3490
3528
|
return result_train, result_eval_sets
|
|
3491
3529
|
|
|
3492
|
-
@staticmethod
|
|
3493
|
-
def _round_shap_value(shap: float) -> float:
|
|
3494
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3495
|
-
return 0.0001
|
|
3496
|
-
else:
|
|
3497
|
-
return round(shap, 4)
|
|
3498
|
-
|
|
3499
3530
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3500
|
-
llm_source = "LLM with external data augmentation"
|
|
3501
3531
|
if self._search_task is None:
|
|
3502
3532
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3503
3533
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3508,116 +3538,40 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3508
3538
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3509
3539
|
|
|
3510
3540
|
self.feature_names_ = []
|
|
3541
|
+
self.dropped_client_feature_names_ = []
|
|
3511
3542
|
self.feature_importances_ = []
|
|
3512
3543
|
features_info = []
|
|
3513
3544
|
features_info_without_links = []
|
|
3514
3545
|
internal_features_info = []
|
|
3515
3546
|
|
|
3516
|
-
def list_or_single(lst: List[str], single: str):
|
|
3517
|
-
return lst or ([single] if single else [])
|
|
3518
|
-
|
|
3519
|
-
def to_anchor(link: str, value: str) -> str:
|
|
3520
|
-
if not value:
|
|
3521
|
-
return ""
|
|
3522
|
-
elif not link:
|
|
3523
|
-
return value
|
|
3524
|
-
elif value == llm_source:
|
|
3525
|
-
return value
|
|
3526
|
-
else:
|
|
3527
|
-
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
3528
|
-
|
|
3529
|
-
def make_links(names: List[str], links: List[str]):
|
|
3530
|
-
all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
3531
|
-
return ",".join(all_links)
|
|
3532
|
-
|
|
3533
3547
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3534
3548
|
for feature_meta in features_meta:
|
|
3535
3549
|
if feature_meta.name in original_names_dict.keys():
|
|
3536
3550
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3537
|
-
|
|
3551
|
+
|
|
3552
|
+
is_client_feature = feature_meta.name in x_columns
|
|
3553
|
+
|
|
3554
|
+
if feature_meta.shap_value == 0.0:
|
|
3555
|
+
if self.select_features:
|
|
3556
|
+
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3557
|
+
continue
|
|
3558
|
+
|
|
3559
|
+
# Use only important features
|
|
3538
3560
|
if (
|
|
3539
|
-
feature_meta.name in
|
|
3561
|
+
feature_meta.name in self.fit_generated_features
|
|
3540
3562
|
or feature_meta.name == COUNTRY
|
|
3541
|
-
|
|
3542
|
-
or
|
|
3563
|
+
# In select_features mode we select also from etalon features and need to show them
|
|
3564
|
+
or (not self.select_features and is_client_feature)
|
|
3543
3565
|
):
|
|
3544
3566
|
continue
|
|
3545
3567
|
|
|
3546
|
-
feature_sample = []
|
|
3547
3568
|
self.feature_names_.append(feature_meta.name)
|
|
3548
|
-
self.feature_importances_.append(
|
|
3549
|
-
if feature_meta.name in features_df.columns:
|
|
3550
|
-
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3551
|
-
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
3552
|
-
feature_sample = [round(f, 4) for f in feature_sample]
|
|
3553
|
-
feature_sample = [str(f) for f in feature_sample]
|
|
3554
|
-
feature_sample = ", ".join(feature_sample)
|
|
3555
|
-
if len(feature_sample) > 30:
|
|
3556
|
-
feature_sample = feature_sample[:30] + "..."
|
|
3557
|
-
|
|
3558
|
-
internal_provider = feature_meta.data_provider or "Upgini"
|
|
3559
|
-
providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
3560
|
-
provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
3561
|
-
if providers:
|
|
3562
|
-
provider = make_links(providers, provider_links)
|
|
3563
|
-
else:
|
|
3564
|
-
provider = to_anchor("https://upgini.com", "Upgini")
|
|
3565
|
-
|
|
3566
|
-
internal_source = feature_meta.data_source or (
|
|
3567
|
-
llm_source
|
|
3568
|
-
if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
|
|
3569
|
-
else ""
|
|
3570
|
-
)
|
|
3571
|
-
sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
3572
|
-
source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
3573
|
-
if sources:
|
|
3574
|
-
source = make_links(sources, source_links)
|
|
3575
|
-
else:
|
|
3576
|
-
source = internal_source
|
|
3569
|
+
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3577
3570
|
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
feature_name = internal_feature_name
|
|
3583
|
-
|
|
3584
|
-
features_info.append(
|
|
3585
|
-
{
|
|
3586
|
-
self.bundle.get("features_info_name"): feature_name,
|
|
3587
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3588
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3589
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3590
|
-
self.bundle.get("features_info_provider"): provider,
|
|
3591
|
-
self.bundle.get("features_info_source"): source,
|
|
3592
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3593
|
-
}
|
|
3594
|
-
)
|
|
3595
|
-
features_info_without_links.append(
|
|
3596
|
-
{
|
|
3597
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3598
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3599
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3600
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3601
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3602
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3603
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3604
|
-
}
|
|
3605
|
-
)
|
|
3606
|
-
internal_features_info.append(
|
|
3607
|
-
{
|
|
3608
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3609
|
-
"feature_link": feature_meta.doc_link,
|
|
3610
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3611
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3612
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3613
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3614
|
-
"provider_link": feature_meta.data_provider_link,
|
|
3615
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3616
|
-
"source_link": feature_meta.data_source_link,
|
|
3617
|
-
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3618
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3619
|
-
}
|
|
3620
|
-
)
|
|
3571
|
+
feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
|
|
3572
|
+
features_info.append(feature_info.to_row(self.bundle))
|
|
3573
|
+
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
|
3574
|
+
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
|
3621
3575
|
|
|
3622
3576
|
if len(features_info) > 0:
|
|
3623
3577
|
self.features_info = pd.DataFrame(features_info)
|
|
@@ -3642,7 +3596,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3642
3596
|
autofe_meta = self._search_task.get_autofe_metadata()
|
|
3643
3597
|
if autofe_meta is None:
|
|
3644
3598
|
return None
|
|
3645
|
-
|
|
3599
|
+
if len(self._internal_features_info) != 0:
|
|
3600
|
+
|
|
3601
|
+
def to_feature_meta(row):
|
|
3602
|
+
fm = FeaturesMetadataV2(
|
|
3603
|
+
name=row[bundle.get("features_info_name")],
|
|
3604
|
+
type="",
|
|
3605
|
+
source="",
|
|
3606
|
+
hit_rate=bundle.get("features_info_hitrate"),
|
|
3607
|
+
shap_value=bundle.get("features_info_shap"),
|
|
3608
|
+
data_source=bundle.get("features_info_source"),
|
|
3609
|
+
)
|
|
3610
|
+
return fm
|
|
3611
|
+
|
|
3612
|
+
features_meta = self._internal_features_info.apply(to_feature_meta).to_list()
|
|
3613
|
+
else:
|
|
3614
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3646
3615
|
|
|
3647
3616
|
def get_feature_by_name(name: str):
|
|
3648
3617
|
for m in features_meta:
|
|
@@ -3762,11 +3731,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3762
3731
|
if len(passed_unsupported_search_keys) > 0:
|
|
3763
3732
|
raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
|
|
3764
3733
|
|
|
3734
|
+
x_columns = [
|
|
3735
|
+
c
|
|
3736
|
+
for c in x.columns
|
|
3737
|
+
if c not in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
3738
|
+
]
|
|
3739
|
+
|
|
3765
3740
|
for column_id, meaning_type in search_keys.items():
|
|
3766
3741
|
column_name = None
|
|
3767
3742
|
if isinstance(column_id, str):
|
|
3768
3743
|
if column_id not in x.columns:
|
|
3769
|
-
raise ValidationError(self.bundle.get("search_key_not_found").format(column_id,
|
|
3744
|
+
raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, x_columns))
|
|
3770
3745
|
column_name = column_id
|
|
3771
3746
|
valid_search_keys[column_name] = meaning_type
|
|
3772
3747
|
elif isinstance(column_id, int):
|
upgini/http.py
CHANGED
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional,
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
|
|
|
228
228
|
return c
|
|
229
229
|
return None
|
|
230
230
|
|
|
231
|
-
def search_types(self) ->
|
|
232
|
-
search_keys =
|
|
231
|
+
def search_types(self) -> Dict[SearchKey, str]:
|
|
232
|
+
search_keys = dict()
|
|
233
233
|
for keys_group in self.searchKeys:
|
|
234
234
|
for key in keys_group:
|
|
235
235
|
column = self.column_by_name(key)
|
|
236
236
|
if column:
|
|
237
|
-
search_keys
|
|
237
|
+
search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
|
|
238
238
|
return search_keys
|
|
239
239
|
|
|
240
240
|
|
upgini/metrics.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
import inspect
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
210
211
|
}
|
|
211
212
|
|
|
212
213
|
|
|
214
|
+
@dataclass
|
|
215
|
+
class _CrossValResults:
|
|
216
|
+
metric: Optional[float]
|
|
217
|
+
metric_std: Optional[float]
|
|
218
|
+
shap_values: Optional[Dict[str, float]]
|
|
219
|
+
|
|
220
|
+
def get_display_metric(self) -> Optional[str]:
|
|
221
|
+
if self.metric is None:
|
|
222
|
+
return None
|
|
223
|
+
elif self.metric_std is None:
|
|
224
|
+
return f"{self.metric:.3f}"
|
|
225
|
+
else:
|
|
226
|
+
return f"{self.metric:.3f} ± {self.metric_std:.3f}"
|
|
227
|
+
|
|
228
|
+
|
|
213
229
|
class EstimatorWrapper:
|
|
214
230
|
def __init__(
|
|
215
231
|
self,
|
|
@@ -297,11 +313,11 @@ class EstimatorWrapper:
|
|
|
297
313
|
|
|
298
314
|
def cross_val_predict(
|
|
299
315
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
300
|
-
) ->
|
|
316
|
+
) -> _CrossValResults:
|
|
301
317
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
302
318
|
|
|
303
319
|
if x.shape[1] == 0:
|
|
304
|
-
return None
|
|
320
|
+
return _CrossValResults(metric=None, metric_std=None, shap_values=None)
|
|
305
321
|
|
|
306
322
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
307
323
|
|
|
@@ -326,7 +342,7 @@ class EstimatorWrapper:
|
|
|
326
342
|
|
|
327
343
|
self.check_fold_metrics(metrics_by_fold)
|
|
328
344
|
|
|
329
|
-
metric =
|
|
345
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
|
|
330
346
|
|
|
331
347
|
splits = self.cv.split(x, y, groups)
|
|
332
348
|
|
|
@@ -351,7 +367,7 @@ class EstimatorWrapper:
|
|
|
351
367
|
else:
|
|
352
368
|
average_shap_values = None
|
|
353
369
|
|
|
354
|
-
return
|
|
370
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
|
|
355
371
|
|
|
356
372
|
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
|
|
357
373
|
return shap_values
|
|
@@ -367,17 +383,25 @@ class EstimatorWrapper:
|
|
|
367
383
|
metric = 2 * metric - 1
|
|
368
384
|
return metric
|
|
369
385
|
|
|
370
|
-
def calculate_metric(
|
|
386
|
+
def calculate_metric(
|
|
387
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
388
|
+
) -> _CrossValResults:
|
|
371
389
|
x, y, _ = self._prepare_to_calculate(x, y)
|
|
372
390
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
373
|
-
metric = roc_auc_score(y, x[baseline_score_column])
|
|
391
|
+
metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
|
|
374
392
|
else:
|
|
375
393
|
metrics = []
|
|
376
394
|
for est in self.cv_estimators:
|
|
377
395
|
metrics.append(self.scorer(est, x, y))
|
|
378
396
|
|
|
379
|
-
metric =
|
|
380
|
-
return
|
|
397
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics)
|
|
398
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
|
|
399
|
+
|
|
400
|
+
def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
|
|
401
|
+
metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
|
|
402
|
+
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
403
|
+
metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
|
|
404
|
+
return metric, metric_std
|
|
381
405
|
|
|
382
406
|
@staticmethod
|
|
383
407
|
def create(
|
|
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
591
615
|
|
|
592
616
|
def cross_val_predict(
|
|
593
617
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
594
|
-
) ->
|
|
618
|
+
) -> _CrossValResults:
|
|
595
619
|
try:
|
|
596
620
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
597
621
|
except Exception as e:
|
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -82,7 +82,7 @@ unregistered_only_personal_keys=Only personal search keys used. Api_key from pro
|
|
|
82
82
|
search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
|
|
83
83
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
84
84
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
85
|
-
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of
|
|
85
|
+
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
|
|
86
86
|
search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
87
87
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
88
88
|
single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
@@ -201,7 +201,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
201
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
202
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
203
203
|
phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
|
-
target_type_detected
|
|
204
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
205
205
|
binary_target_reason=only two unique label-values observed
|
|
206
206
|
non_numeric_multiclass_reason=non-numeric label values observed
|
|
207
207
|
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
@@ -212,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
|
|
|
212
212
|
all_ok_community_invite=❓ Support request
|
|
213
213
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
214
214
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
215
|
-
imbalanced_target
|
|
215
|
+
imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -109,20 +109,63 @@ class DateTimeSearchKeyConverter:
|
|
|
109
109
|
|
|
110
110
|
df = self.clean_old_dates(df)
|
|
111
111
|
|
|
112
|
+
# Define function to apply sine and cosine transformations
|
|
113
|
+
def add_cyclical_features(df, column, period):
|
|
114
|
+
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
|
+
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
|
+
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
+
if sin_feature not in df.columns:
|
|
118
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
+
self.generated_features.append(sin_feature)
|
|
120
|
+
if cos_feature not in df.columns:
|
|
121
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
+
self.generated_features.append(cos_feature)
|
|
123
|
+
|
|
124
|
+
df["quarter"] = df[self.date_column].dt.quarter
|
|
125
|
+
|
|
126
|
+
# Calculate the start date of the quarter for each timestamp
|
|
127
|
+
df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
|
|
128
|
+
|
|
129
|
+
# Calculate the day in the quarter
|
|
130
|
+
df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
|
|
131
|
+
|
|
132
|
+
# Vectorized calculation of days_in_quarter
|
|
133
|
+
quarter = df["quarter"]
|
|
134
|
+
start = df["quarter_start"]
|
|
135
|
+
year = start.dt.year
|
|
136
|
+
month = start.dt.month
|
|
137
|
+
|
|
138
|
+
quarter_end_year = np.where(quarter == 4, year + 1, year)
|
|
139
|
+
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
140
|
+
|
|
141
|
+
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
142
|
+
end.index = df.index
|
|
143
|
+
|
|
144
|
+
df["days_in_quarter"] = (end - start).dt.days
|
|
145
|
+
|
|
146
|
+
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
|
147
|
+
|
|
148
|
+
df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
|
|
149
|
+
|
|
112
150
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
113
151
|
|
|
114
152
|
seconds_without_na = df[seconds].dropna()
|
|
115
153
|
if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
|
|
116
154
|
self.logger.info("Time found in date search key. Add extra features based on time")
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
155
|
+
|
|
156
|
+
# Extract basic components
|
|
157
|
+
df["second"] = df[self.date_column].dt.second
|
|
158
|
+
df["minute"] = df[self.date_column].dt.minute
|
|
159
|
+
df["hour"] = df[self.date_column].dt.hour
|
|
160
|
+
|
|
161
|
+
# Apply cyclical transformations
|
|
162
|
+
add_cyclical_features(df, "second", 60) # Seconds in a minute
|
|
163
|
+
add_cyclical_features(df, "minute", 60) # Minutes in an hour
|
|
164
|
+
add_cyclical_features(df, "minute", 30) # Minutes in half an hour
|
|
165
|
+
add_cyclical_features(df, "hour", 24) # Hours in a day
|
|
166
|
+
|
|
167
|
+
# Drop intermediate columns if not needed
|
|
168
|
+
df.drop(columns=["second", "minute", "hour"], inplace=True)
|
|
126
169
|
|
|
127
170
|
df.drop(columns=seconds, inplace=True)
|
|
128
171
|
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
|
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add a domain column for every configured email column.

    For each name in ``self.email_columns`` the target column is
    ``<email column> + self.DOMAIN_SUFFIX``. Columns that already exist in
    ``df`` are left untouched and are not recorded again in
    ``self.generated_features``. ``df`` is modified in place and returned.
    """
    for email_col in self.email_columns:
        domain_feature = email_col + self.DOMAIN_SUFFIX
        if domain_feature in df.columns:
            # Already generated earlier - do not recompute or re-register it.
            continue
        domains = df[email_col].apply(self._email_to_domain)
        df[domain_feature] = domains.astype("string")
        self.generated_features.append(domain_feature)
    return df
|
|
44
45
|
|
|
45
46
|
@staticmethod
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
+
from upgini.resource_bundle import ResourceBundle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class FeatureInfo:
    """Presentation data for one feature, in display and plain ("internal") forms.

    ``name``/``provider``/``source`` are the display variants produced by
    ``_get_name``/``_get_provider``/``_get_source`` (these may contain HTML
    anchors); the ``internal_*`` fields hold the corresponding plain values.
    """

    name: str
    internal_name: str
    rounded_shap: float
    hitrate: float
    value_preview: str
    provider: str
    internal_provider: str
    source: str
    internal_source: str
    update_frequency: str
    commercial_schema: str
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @staticmethod
    def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
        """Build a ``FeatureInfo`` from feature metadata and the dataframe used for the value preview."""
        meta = feature_meta
        return FeatureInfo(
            name=_get_name(meta),
            internal_name=_get_internal_name(meta),
            rounded_shap=_round_shap_value(meta.shap_value),
            hitrate=meta.hit_rate,
            value_preview=_get_feature_sample(meta, data),
            provider=_get_provider(meta, is_client_feature),
            internal_provider=_get_internal_provider(meta, is_client_feature),
            source=_get_source(meta, is_client_feature),
            internal_source=_get_internal_source(meta, is_client_feature),
            update_frequency=meta.update_frequency,
            commercial_schema=meta.commercial_schema,
            doc_link=meta.doc_link,
            data_provider_link=meta.data_provider_link,
            data_source_link=meta.data_source_link,
        )

    def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the display row; keys are column titles looked up in ``bundle``."""
        title = bundle.get
        return {
            title("features_info_name"): self.name,
            title("features_info_shap"): self.rounded_shap,
            title("features_info_hitrate"): self.hitrate,
            title("features_info_value_preview"): self.value_preview,
            title("features_info_provider"): self.provider,
            title("features_info_source"): self.source,
            title("features_info_update_frequency"): self.update_frequency,
        }

    def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the same columns as ``to_row`` but filled with the plain ``internal_*`` values."""
        title = bundle.get
        return {
            title("features_info_name"): self.internal_name,
            title("features_info_shap"): self.rounded_shap,
            title("features_info_hitrate"): self.hitrate,
            title("features_info_value_preview"): self.value_preview,
            title("features_info_provider"): self.internal_provider,
            title("features_info_source"): self.internal_source,
            title("features_info_update_frequency"): self.update_frequency,
        }

    def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the plain row extended with raw link columns and the commercial schema.

        The link columns use fixed keys (``feature_link``, ``provider_link``,
        ``source_link``); a missing commercial schema is emitted as ``""``.
        """
        title = bundle.get
        return {
            title("features_info_name"): self.internal_name,
            "feature_link": self.doc_link,
            title("features_info_shap"): self.rounded_shap,
            title("features_info_hitrate"): self.hitrate,
            title("features_info_value_preview"): self.value_preview,
            title("features_info_provider"): self.internal_provider,
            "provider_link": self.data_provider_link,
            title("features_info_source"): self.internal_source,
            "source_link": self.data_source_link,
            title("features_info_commercial_schema"): self.commercial_schema or "",
            title("features_info_update_frequency"): self.update_frequency,
        }
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
|
|
90
|
+
if feature_meta.name in data.columns:
|
|
91
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
92
|
+
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
93
|
+
feature_sample = [round(f, 4) for f in feature_sample]
|
|
94
|
+
feature_sample = [str(f) for f in feature_sample]
|
|
95
|
+
feature_sample = ", ".join(feature_sample)
|
|
96
|
+
if len(feature_sample) > 30:
|
|
97
|
+
feature_sample = feature_sample[:30] + "..."
|
|
98
|
+
else:
|
|
99
|
+
feature_sample = ""
|
|
100
|
+
return feature_sample
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
104
|
+
if feature_meta.doc_link:
|
|
105
|
+
return _to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
106
|
+
else:
|
|
107
|
+
return feature_meta.name
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
111
|
+
return feature_meta.name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the display value for the provider column.

    Uses the list fields (``data_providers`` / ``data_provider_links``) when
    set, otherwise the single-value fields. With no provider at all, client
    features get ``""`` and every other feature gets an "Upgini" anchor.
    """
    names = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if names:
        return _make_links(names, links)
    if is_client_feature:
        return ""
    return _to_anchor("https://upgini.com", "Upgini")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
125
|
+
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the display value for the source column.

    Uses the list fields (``data_sources`` / ``data_source_links``) when set,
    otherwise the single-value fields. With no source at all, falls back to
    the value computed by ``_get_internal_source``.
    """
    names = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if not names:
        return _get_internal_source(feature_meta, is_client_feature)
    return _make_links(names, links)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
139
|
+
return feature_meta.data_source or (
|
|
140
|
+
LLM_SOURCE
|
|
141
|
+
if not feature_meta.name.endswith("_country")
|
|
142
|
+
and not feature_meta.name.endswith("_postal_code")
|
|
143
|
+
and not is_client_feature
|
|
144
|
+
else ""
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _list_or_single(lst: List[str], single: str):
|
|
149
|
+
return lst or ([single] if single else [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_anchor(link: str, value: str) -> str:
|
|
153
|
+
if not value:
|
|
154
|
+
return ""
|
|
155
|
+
elif not link:
|
|
156
|
+
return value
|
|
157
|
+
elif value == LLM_SOURCE:
|
|
158
|
+
return value
|
|
159
|
+
else:
|
|
160
|
+
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _make_links(names: List[str], links: List[str]):
|
|
164
|
+
all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
165
|
+
return ",".join(all_links)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _round_shap_value(shap: float) -> float:
|
|
169
|
+
if shap > 0.0 and shap < 0.0001:
|
|
170
|
+
return 0.0001
|
|
171
|
+
else:
|
|
172
|
+
return round(shap, 4)
|
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
8
|
|
|
@@ -83,10 +84,21 @@ class FeaturesValidator:
|
|
|
83
84
|
return [
|
|
84
85
|
i
|
|
85
86
|
for i in df
|
|
86
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
87
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
|
|
87
88
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
88
89
|
]
|
|
89
90
|
|
|
91
|
+
@staticmethod
|
|
92
|
+
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
+
return (
|
|
94
|
+
is_integer_dtype(series)
|
|
95
|
+
or series.dropna()
|
|
96
|
+
.apply(
|
|
97
|
+
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
+
)
|
|
99
|
+
.all()
|
|
100
|
+
)
|
|
101
|
+
|
|
90
102
|
@staticmethod
|
|
91
103
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
92
104
|
return [i for i in df if df[i].nunique() <= 1]
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=kkc0PkaP1QFrTEPI8N5OtZ0p2wkfpteTOMyPLNGAXgk,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=zcIEKzgiUX46KdtWlMl-15Dz32shXVgscvsQkULusoU,192228
|
|
7
|
+
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
|
|
10
|
+
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=fOAeLTsnx8xvJK-7RPFXprATG0n56jeCdse8sQTuVX8,26674
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -43,12 +43,13 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
43
43
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
44
44
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
45
45
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
46
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
+
upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
|
-
upgini/utils/
|
|
51
|
+
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
52
|
+
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
52
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
53
54
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
54
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -58,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
58
59
|
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
59
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
60
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.29.dist-info/METADATA,sha256=i3crJ_plUCfgF91rGJXv_slwHAANMRUps6kRrNjIIso,48578
|
|
63
|
+
upgini-1.2.29.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.29.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.29.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|