upgini 1.2.28__tar.gz → 1.2.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.28 → upgini-1.2.29}/PKG-INFO +1 -1
- upgini-1.2.29/src/upgini/__about__.py +1 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/features_enricher.py +98 -130
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/http.py +1 -1
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/metadata.py +4 -4
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/metrics.py +33 -9
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/strings.properties +1 -1
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/datetime_utils.py +52 -9
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/email_utils.py +3 -2
- upgini-1.2.29/src/upgini/utils/feature_info.py +172 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/features_validator.py +13 -1
- upgini-1.2.28/src/upgini/__about__.py +0 -1
- {upgini-1.2.28 → upgini-1.2.29}/.gitignore +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/LICENSE +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/README.md +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/pyproject.toml +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/ads.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/dataset.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/errors.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/search_task.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/spinner.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.28 → upgini-1.2.29}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.29"
|
|
@@ -54,6 +54,7 @@ from upgini.metadata import (
|
|
|
54
54
|
SYSTEM_RECORD_ID,
|
|
55
55
|
TARGET,
|
|
56
56
|
CVType,
|
|
57
|
+
FeaturesMetadataV2,
|
|
57
58
|
FileColumnMeaningType,
|
|
58
59
|
ModelTaskType,
|
|
59
60
|
RuntimeParameters,
|
|
@@ -95,6 +96,7 @@ from upgini.utils.email_utils import (
|
|
|
95
96
|
EmailSearchKeyConverter,
|
|
96
97
|
EmailSearchKeyDetector,
|
|
97
98
|
)
|
|
99
|
+
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
98
100
|
from upgini.utils.features_validator import FeaturesValidator
|
|
99
101
|
from upgini.utils.format import Format
|
|
100
102
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -158,6 +160,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
158
160
|
|
|
159
161
|
shared_datasets: list of str, optional (default=None)
|
|
160
162
|
List of private shared dataset ids for custom search
|
|
163
|
+
|
|
164
|
+
select_features: bool, optional (default=False)
|
|
165
|
+
If True, return only selected features both from input and data sources.
|
|
166
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
161
167
|
"""
|
|
162
168
|
|
|
163
169
|
TARGET_NAME = "target"
|
|
@@ -224,6 +230,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
224
230
|
client_visitorid: Optional[str] = None,
|
|
225
231
|
custom_bundle_config: Optional[str] = None,
|
|
226
232
|
add_date_if_missing: bool = True,
|
|
233
|
+
select_features: bool = False,
|
|
227
234
|
**kwargs,
|
|
228
235
|
):
|
|
229
236
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -277,8 +284,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
277
284
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
278
285
|
self.metrics: Optional[pd.DataFrame] = None
|
|
279
286
|
self.feature_names_ = []
|
|
287
|
+
self.dropped_client_feature_names_ = []
|
|
280
288
|
self.feature_importances_ = []
|
|
281
289
|
self.search_id = search_id
|
|
290
|
+
self.select_features = select_features
|
|
291
|
+
|
|
282
292
|
if search_id:
|
|
283
293
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
284
294
|
|
|
@@ -999,9 +1009,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
999
1009
|
text_features=self.generate_features,
|
|
1000
1010
|
has_date=has_date,
|
|
1001
1011
|
)
|
|
1002
|
-
|
|
1012
|
+
etalon_cv_result = baseline_estimator.cross_val_predict(
|
|
1003
1013
|
fitting_X, y_sorted, self.baseline_score_column
|
|
1004
1014
|
)
|
|
1015
|
+
etalon_metric = etalon_cv_result.get_display_metric()
|
|
1005
1016
|
if etalon_metric is None:
|
|
1006
1017
|
self.logger.info(
|
|
1007
1018
|
f"Baseline {metric} on train client features is None (maybe all features was removed)"
|
|
@@ -1033,9 +1044,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1033
1044
|
text_features=self.generate_features,
|
|
1034
1045
|
has_date=has_date,
|
|
1035
1046
|
)
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1047
|
+
enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
1048
|
+
enriched_metric = enriched_cv_result.get_display_metric()
|
|
1049
|
+
enriched_shaps = enriched_cv_result.shap_values
|
|
1039
1050
|
|
|
1040
1051
|
if enriched_shaps is not None:
|
|
1041
1052
|
self._update_shap_values(enriched_shaps)
|
|
@@ -1048,7 +1059,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1048
1059
|
else:
|
|
1049
1060
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
|
1050
1061
|
if etalon_metric is not None and enriched_metric is not None:
|
|
1051
|
-
uplift = (
|
|
1062
|
+
uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
|
|
1052
1063
|
|
|
1053
1064
|
train_metrics = {
|
|
1054
1065
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
@@ -1091,9 +1102,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1091
1102
|
f"Calculate baseline {metric} on eval set {idx + 1} "
|
|
1092
1103
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
|
1093
1104
|
)
|
|
1094
|
-
|
|
1105
|
+
etalon_eval_results = baseline_estimator.calculate_metric(
|
|
1095
1106
|
eval_X_sorted, eval_y_sorted, self.baseline_score_column
|
|
1096
1107
|
)
|
|
1108
|
+
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
|
1097
1109
|
self.logger.info(
|
|
1098
1110
|
f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
|
|
1099
1111
|
)
|
|
@@ -1105,9 +1117,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1105
1117
|
f"Calculate enriched {metric} on eval set {idx + 1} "
|
|
1106
1118
|
f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
|
|
1107
1119
|
)
|
|
1108
|
-
|
|
1120
|
+
enriched_eval_results = enriched_estimator.calculate_metric(
|
|
1109
1121
|
enriched_eval_X_sorted, enriched_eval_y_sorted
|
|
1110
1122
|
)
|
|
1123
|
+
enriched_eval_metric = enriched_eval_results.get_display_metric()
|
|
1111
1124
|
self.logger.info(
|
|
1112
1125
|
f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
|
|
1113
1126
|
)
|
|
@@ -1115,7 +1128,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1115
1128
|
enriched_eval_metric = None
|
|
1116
1129
|
|
|
1117
1130
|
if etalon_eval_metric is not None and enriched_eval_metric is not None:
|
|
1118
|
-
eval_uplift = (
|
|
1131
|
+
eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
|
|
1119
1132
|
else:
|
|
1120
1133
|
eval_uplift = None
|
|
1121
1134
|
|
|
@@ -1198,9 +1211,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1198
1211
|
|
|
1199
1212
|
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1200
1213
|
new_shaps = {
|
|
1201
|
-
feature:
|
|
1202
|
-
for feature, shap in new_shaps.items()
|
|
1203
|
-
if feature in self.feature_names_
|
|
1214
|
+
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1204
1215
|
}
|
|
1205
1216
|
features_importances = list(new_shaps.items())
|
|
1206
1217
|
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
@@ -1249,7 +1260,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1249
1260
|
display_html_dataframe(
|
|
1250
1261
|
self.relevant_data_sources,
|
|
1251
1262
|
self._relevant_data_sources_wo_links,
|
|
1252
|
-
self.bundle.get("
|
|
1263
|
+
self.bundle.get("relevant_data_sources_header"),
|
|
1253
1264
|
display_handle=self.data_sources_display_handle,
|
|
1254
1265
|
)
|
|
1255
1266
|
except (ImportError, NameError):
|
|
@@ -1437,7 +1448,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1437
1448
|
client_features = [
|
|
1438
1449
|
c
|
|
1439
1450
|
for c in X_sampled.columns.to_list()
|
|
1440
|
-
if
|
|
1451
|
+
if (
|
|
1452
|
+
not self.select_features
|
|
1453
|
+
or c in self.feature_names_
|
|
1454
|
+
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1455
|
+
)
|
|
1456
|
+
and c
|
|
1441
1457
|
not in (
|
|
1442
1458
|
excluding_search_keys
|
|
1443
1459
|
+ list(self.fit_dropped_features)
|
|
@@ -1653,7 +1669,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1653
1669
|
generated_features = []
|
|
1654
1670
|
if date_column is not None:
|
|
1655
1671
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1656
|
-
|
|
1672
|
+
# Leave original date column values
|
|
1673
|
+
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1674
|
+
df_with_date_features[date_column] = df[date_column]
|
|
1675
|
+
df = df_with_date_features
|
|
1657
1676
|
generated_features = converter.generated_features
|
|
1658
1677
|
|
|
1659
1678
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1662,9 +1681,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1662
1681
|
df = generator.generate(df)
|
|
1663
1682
|
generated_features.extend(generator.generated_features)
|
|
1664
1683
|
|
|
1665
|
-
normalizer = Normalizer(self.bundle, self.logger)
|
|
1666
|
-
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1667
|
-
columns_renaming = normalizer.columns_renaming
|
|
1684
|
+
# normalizer = Normalizer(self.bundle, self.logger)
|
|
1685
|
+
# df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1686
|
+
# columns_renaming = normalizer.columns_renaming
|
|
1687
|
+
columns_renaming = {c: c for c in df.columns}
|
|
1668
1688
|
|
|
1669
1689
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1670
1690
|
|
|
@@ -1980,9 +2000,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1980
2000
|
file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
|
|
1981
2001
|
search_keys = file_metadata.search_types()
|
|
1982
2002
|
if SearchKey.IPV6_ADDRESS in search_keys:
|
|
1983
|
-
search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
2003
|
+
# search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
2004
|
+
search_keys.pop(SearchKey.IPV6_ADDRESS, None)
|
|
1984
2005
|
|
|
1985
|
-
keys =
|
|
2006
|
+
keys = (
|
|
2007
|
+
"{"
|
|
2008
|
+
+ ", ".join(
|
|
2009
|
+
[
|
|
2010
|
+
f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
|
|
2011
|
+
for key, name in search_keys.items()
|
|
2012
|
+
]
|
|
2013
|
+
)
|
|
2014
|
+
+ "}"
|
|
2015
|
+
)
|
|
1986
2016
|
features_for_transform = self._search_task.get_features_for_transform()
|
|
1987
2017
|
if features_for_transform:
|
|
1988
2018
|
original_features_for_transform = [
|
|
@@ -2063,7 +2093,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2063
2093
|
|
|
2064
2094
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2065
2095
|
|
|
2066
|
-
columns_to_drop = [
|
|
2096
|
+
columns_to_drop = [
|
|
2097
|
+
c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2098
|
+
]
|
|
2067
2099
|
if len(columns_to_drop) > 0:
|
|
2068
2100
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2069
2101
|
self.logger.warning(msg)
|
|
@@ -2092,7 +2124,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2092
2124
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2093
2125
|
if date_column is not None:
|
|
2094
2126
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2095
|
-
df = converter.convert(df)
|
|
2127
|
+
df = converter.convert(df, keep_time=True)
|
|
2096
2128
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2097
2129
|
generated_features.extend(converter.generated_features)
|
|
2098
2130
|
else:
|
|
@@ -2187,11 +2219,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2187
2219
|
|
|
2188
2220
|
if add_fit_system_record_id:
|
|
2189
2221
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2190
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2191
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2192
2222
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2193
2223
|
features_not_to_pass.append(SORT_ID)
|
|
2194
2224
|
|
|
2225
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2226
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2227
|
+
|
|
2195
2228
|
# search keys might be changed after explode
|
|
2196
2229
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2197
2230
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2210,7 +2243,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2210
2243
|
|
|
2211
2244
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2212
2245
|
|
|
2213
|
-
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2246
|
+
df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
|
|
2214
2247
|
|
|
2215
2248
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2216
2249
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -2319,11 +2352,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2319
2352
|
else:
|
|
2320
2353
|
result = enrich()
|
|
2321
2354
|
|
|
2355
|
+
selecting_columns = [
|
|
2356
|
+
c
|
|
2357
|
+
for c in itertools.chain(validated_X.columns.tolist(), generated_features)
|
|
2358
|
+
if c not in self.dropped_client_feature_names_
|
|
2359
|
+
]
|
|
2322
2360
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2323
|
-
|
|
2361
|
+
selecting_columns.extend(
|
|
2324
2362
|
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2325
|
-
|
|
2326
|
-
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2363
|
+
)
|
|
2327
2364
|
if add_fit_system_record_id:
|
|
2328
2365
|
selecting_columns.append(SORT_ID)
|
|
2329
2366
|
|
|
@@ -3490,15 +3527,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3490
3527
|
|
|
3491
3528
|
return result_train, result_eval_sets
|
|
3492
3529
|
|
|
3493
|
-
@staticmethod
|
|
3494
|
-
def _round_shap_value(shap: float) -> float:
|
|
3495
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3496
|
-
return 0.0001
|
|
3497
|
-
else:
|
|
3498
|
-
return round(shap, 4)
|
|
3499
|
-
|
|
3500
3530
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3501
|
-
llm_source = "LLM with external data augmentation"
|
|
3502
3531
|
if self._search_task is None:
|
|
3503
3532
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3504
3533
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3509,116 +3538,40 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3509
3538
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3510
3539
|
|
|
3511
3540
|
self.feature_names_ = []
|
|
3541
|
+
self.dropped_client_feature_names_ = []
|
|
3512
3542
|
self.feature_importances_ = []
|
|
3513
3543
|
features_info = []
|
|
3514
3544
|
features_info_without_links = []
|
|
3515
3545
|
internal_features_info = []
|
|
3516
3546
|
|
|
3517
|
-
def list_or_single(lst: List[str], single: str):
|
|
3518
|
-
return lst or ([single] if single else [])
|
|
3519
|
-
|
|
3520
|
-
def to_anchor(link: str, value: str) -> str:
|
|
3521
|
-
if not value:
|
|
3522
|
-
return ""
|
|
3523
|
-
elif not link:
|
|
3524
|
-
return value
|
|
3525
|
-
elif value == llm_source:
|
|
3526
|
-
return value
|
|
3527
|
-
else:
|
|
3528
|
-
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
3529
|
-
|
|
3530
|
-
def make_links(names: List[str], links: List[str]):
|
|
3531
|
-
all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
3532
|
-
return ",".join(all_links)
|
|
3533
|
-
|
|
3534
3547
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3535
3548
|
for feature_meta in features_meta:
|
|
3536
3549
|
if feature_meta.name in original_names_dict.keys():
|
|
3537
3550
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3538
|
-
|
|
3551
|
+
|
|
3552
|
+
is_client_feature = feature_meta.name in x_columns
|
|
3553
|
+
|
|
3554
|
+
if feature_meta.shap_value == 0.0:
|
|
3555
|
+
if self.select_features:
|
|
3556
|
+
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3557
|
+
continue
|
|
3558
|
+
|
|
3559
|
+
# Use only important features
|
|
3539
3560
|
if (
|
|
3540
|
-
feature_meta.name in
|
|
3561
|
+
feature_meta.name in self.fit_generated_features
|
|
3541
3562
|
or feature_meta.name == COUNTRY
|
|
3542
|
-
|
|
3543
|
-
or
|
|
3563
|
+
# In select_features mode we select also from etalon features and need to show them
|
|
3564
|
+
or (not self.select_features and is_client_feature)
|
|
3544
3565
|
):
|
|
3545
3566
|
continue
|
|
3546
3567
|
|
|
3547
|
-
feature_sample = []
|
|
3548
3568
|
self.feature_names_.append(feature_meta.name)
|
|
3549
|
-
self.feature_importances_.append(
|
|
3550
|
-
if feature_meta.name in features_df.columns:
|
|
3551
|
-
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3552
|
-
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
3553
|
-
feature_sample = [round(f, 4) for f in feature_sample]
|
|
3554
|
-
feature_sample = [str(f) for f in feature_sample]
|
|
3555
|
-
feature_sample = ", ".join(feature_sample)
|
|
3556
|
-
if len(feature_sample) > 30:
|
|
3557
|
-
feature_sample = feature_sample[:30] + "..."
|
|
3558
|
-
|
|
3559
|
-
internal_provider = feature_meta.data_provider or "Upgini"
|
|
3560
|
-
providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
3561
|
-
provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
3562
|
-
if providers:
|
|
3563
|
-
provider = make_links(providers, provider_links)
|
|
3564
|
-
else:
|
|
3565
|
-
provider = to_anchor("https://upgini.com", "Upgini")
|
|
3569
|
+
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3566
3570
|
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
)
|
|
3572
|
-
sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
3573
|
-
source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
3574
|
-
if sources:
|
|
3575
|
-
source = make_links(sources, source_links)
|
|
3576
|
-
else:
|
|
3577
|
-
source = internal_source
|
|
3578
|
-
|
|
3579
|
-
internal_feature_name = feature_meta.name
|
|
3580
|
-
if feature_meta.doc_link:
|
|
3581
|
-
feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
3582
|
-
else:
|
|
3583
|
-
feature_name = internal_feature_name
|
|
3584
|
-
|
|
3585
|
-
features_info.append(
|
|
3586
|
-
{
|
|
3587
|
-
self.bundle.get("features_info_name"): feature_name,
|
|
3588
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3589
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3590
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3591
|
-
self.bundle.get("features_info_provider"): provider,
|
|
3592
|
-
self.bundle.get("features_info_source"): source,
|
|
3593
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3594
|
-
}
|
|
3595
|
-
)
|
|
3596
|
-
features_info_without_links.append(
|
|
3597
|
-
{
|
|
3598
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3599
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3600
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3601
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3602
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3603
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3604
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3605
|
-
}
|
|
3606
|
-
)
|
|
3607
|
-
internal_features_info.append(
|
|
3608
|
-
{
|
|
3609
|
-
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3610
|
-
"feature_link": feature_meta.doc_link,
|
|
3611
|
-
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3612
|
-
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3613
|
-
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3614
|
-
self.bundle.get("features_info_provider"): internal_provider,
|
|
3615
|
-
"provider_link": feature_meta.data_provider_link,
|
|
3616
|
-
self.bundle.get("features_info_source"): internal_source,
|
|
3617
|
-
"source_link": feature_meta.data_source_link,
|
|
3618
|
-
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3619
|
-
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3620
|
-
}
|
|
3621
|
-
)
|
|
3571
|
+
feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
|
|
3572
|
+
features_info.append(feature_info.to_row(self.bundle))
|
|
3573
|
+
features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
|
|
3574
|
+
internal_features_info.append(feature_info.to_internal_row(self.bundle))
|
|
3622
3575
|
|
|
3623
3576
|
if len(features_info) > 0:
|
|
3624
3577
|
self.features_info = pd.DataFrame(features_info)
|
|
@@ -3643,7 +3596,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3643
3596
|
autofe_meta = self._search_task.get_autofe_metadata()
|
|
3644
3597
|
if autofe_meta is None:
|
|
3645
3598
|
return None
|
|
3646
|
-
|
|
3599
|
+
if len(self._internal_features_info) != 0:
|
|
3600
|
+
|
|
3601
|
+
def to_feature_meta(row):
|
|
3602
|
+
fm = FeaturesMetadataV2(
|
|
3603
|
+
name=row[bundle.get("features_info_name")],
|
|
3604
|
+
type="",
|
|
3605
|
+
source="",
|
|
3606
|
+
hit_rate=bundle.get("features_info_hitrate"),
|
|
3607
|
+
shap_value=bundle.get("features_info_shap"),
|
|
3608
|
+
data_source=bundle.get("features_info_source"),
|
|
3609
|
+
)
|
|
3610
|
+
return fm
|
|
3611
|
+
|
|
3612
|
+
features_meta = self._internal_features_info.apply(to_feature_meta).to_list()
|
|
3613
|
+
else:
|
|
3614
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3647
3615
|
|
|
3648
3616
|
def get_feature_by_name(name: str):
|
|
3649
3617
|
for m in features_meta:
|
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional,
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
|
|
|
228
228
|
return c
|
|
229
229
|
return None
|
|
230
230
|
|
|
231
|
-
def search_types(self) ->
|
|
232
|
-
search_keys =
|
|
231
|
+
def search_types(self) -> Dict[SearchKey, str]:
|
|
232
|
+
search_keys = dict()
|
|
233
233
|
for keys_group in self.searchKeys:
|
|
234
234
|
for key in keys_group:
|
|
235
235
|
column = self.column_by_name(key)
|
|
236
236
|
if column:
|
|
237
|
-
search_keys
|
|
237
|
+
search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
|
|
238
238
|
return search_keys
|
|
239
239
|
|
|
240
240
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
import inspect
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
210
211
|
}
|
|
211
212
|
|
|
212
213
|
|
|
214
|
+
@dataclass
|
|
215
|
+
class _CrossValResults:
|
|
216
|
+
metric: Optional[float]
|
|
217
|
+
metric_std: Optional[float]
|
|
218
|
+
shap_values: Optional[Dict[str, float]]
|
|
219
|
+
|
|
220
|
+
def get_display_metric(self) -> Optional[str]:
|
|
221
|
+
if self.metric is None:
|
|
222
|
+
return None
|
|
223
|
+
elif self.metric_std is None:
|
|
224
|
+
return f"{self.metric:.3f}"
|
|
225
|
+
else:
|
|
226
|
+
return f"{self.metric:.3f} ± {self.metric_std:.3f}"
|
|
227
|
+
|
|
228
|
+
|
|
213
229
|
class EstimatorWrapper:
|
|
214
230
|
def __init__(
|
|
215
231
|
self,
|
|
@@ -297,11 +313,11 @@ class EstimatorWrapper:
|
|
|
297
313
|
|
|
298
314
|
def cross_val_predict(
|
|
299
315
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
300
|
-
) ->
|
|
316
|
+
) -> _CrossValResults:
|
|
301
317
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
302
318
|
|
|
303
319
|
if x.shape[1] == 0:
|
|
304
|
-
return None
|
|
320
|
+
return _CrossValResults(metric=None, metric_std=None, shap_values=None)
|
|
305
321
|
|
|
306
322
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
307
323
|
|
|
@@ -326,7 +342,7 @@ class EstimatorWrapper:
|
|
|
326
342
|
|
|
327
343
|
self.check_fold_metrics(metrics_by_fold)
|
|
328
344
|
|
|
329
|
-
metric =
|
|
345
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
|
|
330
346
|
|
|
331
347
|
splits = self.cv.split(x, y, groups)
|
|
332
348
|
|
|
@@ -351,7 +367,7 @@ class EstimatorWrapper:
|
|
|
351
367
|
else:
|
|
352
368
|
average_shap_values = None
|
|
353
369
|
|
|
354
|
-
return
|
|
370
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
|
|
355
371
|
|
|
356
372
|
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
|
|
357
373
|
return shap_values
|
|
@@ -367,17 +383,25 @@ class EstimatorWrapper:
|
|
|
367
383
|
metric = 2 * metric - 1
|
|
368
384
|
return metric
|
|
369
385
|
|
|
370
|
-
def calculate_metric(
|
|
386
|
+
def calculate_metric(
|
|
387
|
+
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
388
|
+
) -> _CrossValResults:
|
|
371
389
|
x, y, _ = self._prepare_to_calculate(x, y)
|
|
372
390
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
373
|
-
metric = roc_auc_score(y, x[baseline_score_column])
|
|
391
|
+
metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
|
|
374
392
|
else:
|
|
375
393
|
metrics = []
|
|
376
394
|
for est in self.cv_estimators:
|
|
377
395
|
metrics.append(self.scorer(est, x, y))
|
|
378
396
|
|
|
379
|
-
metric =
|
|
380
|
-
return
|
|
397
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics)
|
|
398
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
|
|
399
|
+
|
|
400
|
+
def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
|
|
401
|
+
metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
|
|
402
|
+
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
403
|
+
metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
|
|
404
|
+
return metric, metric_std
|
|
381
405
|
|
|
382
406
|
@staticmethod
|
|
383
407
|
def create(
|
|
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
591
615
|
|
|
592
616
|
def cross_val_predict(
|
|
593
617
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
594
|
-
) ->
|
|
618
|
+
) -> _CrossValResults:
|
|
595
619
|
try:
|
|
596
620
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
597
621
|
except Exception as e:
|
|
@@ -82,7 +82,7 @@ unregistered_only_personal_keys=Only personal search keys used. Api_key from pro
|
|
|
82
82
|
search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
|
|
83
83
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
84
84
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
85
|
-
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of
|
|
85
|
+
unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
|
|
86
86
|
search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
87
87
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
88
88
|
single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
@@ -109,20 +109,63 @@ class DateTimeSearchKeyConverter:
|
|
|
109
109
|
|
|
110
110
|
df = self.clean_old_dates(df)
|
|
111
111
|
|
|
112
|
+
# Define function to apply sine and cosine transformations
|
|
113
|
+
def add_cyclical_features(df, column, period):
|
|
114
|
+
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
|
+
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
|
+
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
+
if sin_feature not in df.columns:
|
|
118
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
+
self.generated_features.append(sin_feature)
|
|
120
|
+
if cos_feature not in df.columns:
|
|
121
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
+
self.generated_features.append(cos_feature)
|
|
123
|
+
|
|
124
|
+
df["quarter"] = df[self.date_column].dt.quarter
|
|
125
|
+
|
|
126
|
+
# Calculate the start date of the quarter for each timestamp
|
|
127
|
+
df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
|
|
128
|
+
|
|
129
|
+
# Calculate the day in the quarter
|
|
130
|
+
df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
|
|
131
|
+
|
|
132
|
+
# Vectorized calculation of days_in_quarter
|
|
133
|
+
quarter = df["quarter"]
|
|
134
|
+
start = df["quarter_start"]
|
|
135
|
+
year = start.dt.year
|
|
136
|
+
month = start.dt.month
|
|
137
|
+
|
|
138
|
+
quarter_end_year = np.where(quarter == 4, year + 1, year)
|
|
139
|
+
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
140
|
+
|
|
141
|
+
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
142
|
+
end.index = df.index
|
|
143
|
+
|
|
144
|
+
df["days_in_quarter"] = (end - start).dt.days
|
|
145
|
+
|
|
146
|
+
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
|
147
|
+
|
|
148
|
+
df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
|
|
149
|
+
|
|
112
150
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
113
151
|
|
|
114
152
|
seconds_without_na = df[seconds].dropna()
|
|
115
153
|
if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
|
|
116
154
|
self.logger.info("Time found in date search key. Add extra features based on time")
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
155
|
+
|
|
156
|
+
# Extract basic components
|
|
157
|
+
df["second"] = df[self.date_column].dt.second
|
|
158
|
+
df["minute"] = df[self.date_column].dt.minute
|
|
159
|
+
df["hour"] = df[self.date_column].dt.hour
|
|
160
|
+
|
|
161
|
+
# Apply cyclical transformations
|
|
162
|
+
add_cyclical_features(df, "second", 60) # Seconds in a minute
|
|
163
|
+
add_cyclical_features(df, "minute", 60) # Minutes in an hour
|
|
164
|
+
add_cyclical_features(df, "minute", 30) # Minutes in half an hour
|
|
165
|
+
add_cyclical_features(df, "hour", 24) # Hours in a day
|
|
166
|
+
|
|
167
|
+
# Drop intermediate columns if not needed
|
|
168
|
+
df.drop(columns=["second", "minute", "hour"], inplace=True)
|
|
126
169
|
|
|
127
170
|
df.drop(columns=seconds, inplace=True)
|
|
128
171
|
|
|
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
|
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
39
|
for email_col in self.email_columns:
|
|
40
40
|
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
if domain_feature not in df.columns:
|
|
42
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
|
|
43
|
+
self.generated_features.append(domain_feature)
|
|
43
44
|
return df
|
|
44
45
|
|
|
45
46
|
@staticmethod
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import itertools
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
+
from upgini.resource_bundle import ResourceBundle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class FeatureInfo:
    """Per-feature values collected from metadata and emitted as table rows.

    ``name``, ``provider`` and ``source`` hold the values produced by
    ``_get_name``, ``_get_provider`` and ``_get_source``; the ``internal_*``
    fields hold the counterparts produced by the ``_get_internal_*`` helpers.
    """

    name: str
    internal_name: str
    rounded_shap: float
    hitrate: float
    value_preview: str
    provider: str
    internal_provider: str
    source: str
    internal_source: str
    update_frequency: str
    commercial_schema: str
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @staticmethod
    def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
        """Build a ``FeatureInfo`` from one metadata record.

        Args:
            feature_meta: Metadata record the field values are read from.
            data: Frame forwarded to ``_get_feature_sample`` for the preview.
            is_client_feature: Flag forwarded to the provider/source helpers.

        Returns:
            The populated ``FeatureInfo``.
        """
        shown_name = _get_name(feature_meta)
        plain_name = _get_internal_name(feature_meta)
        shap = _round_shap_value(feature_meta.shap_value)
        hit_rate = feature_meta.hit_rate
        preview = _get_feature_sample(feature_meta, data)
        shown_provider = _get_provider(feature_meta, is_client_feature)
        plain_provider = _get_internal_provider(feature_meta, is_client_feature)
        shown_source = _get_source(feature_meta, is_client_feature)
        plain_source = _get_internal_source(feature_meta, is_client_feature)
        return FeatureInfo(
            name=shown_name,
            internal_name=plain_name,
            rounded_shap=shap,
            hitrate=hit_rate,
            value_preview=preview,
            provider=shown_provider,
            internal_provider=plain_provider,
            source=shown_source,
            internal_source=plain_source,
            update_frequency=feature_meta.update_frequency,
            commercial_schema=feature_meta.commercial_schema,
            doc_link=feature_meta.doc_link,
            data_provider_link=feature_meta.data_provider_link,
            data_source_link=feature_meta.data_source_link,
        )

    def _labelled_row(self, bundle: ResourceBundle, name: str, provider: str, source: str) -> Dict[str, str]:
        """Return the seven-column row shared by ``to_row`` and ``to_row_without_links``."""
        columns = (
            ("features_info_name", name),
            ("features_info_shap", self.rounded_shap),
            ("features_info_hitrate", self.hitrate),
            ("features_info_value_preview", self.value_preview),
            ("features_info_provider", provider),
            ("features_info_source", source),
            ("features_info_update_frequency", self.update_frequency),
        )
        # Column labels are looked up in the bundle; insertion order is the column order.
        return {bundle.get(label_key): value for label_key, value in columns}

    def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the row built from ``name``, ``provider`` and ``source``."""
        return self._labelled_row(bundle, self.name, self.provider, self.source)

    def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the same columns as ``to_row`` using the ``internal_*`` values."""
        return self._labelled_row(bundle, self.internal_name, self.internal_provider, self.internal_source)

    def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the ``internal_*`` values plus link and commercial-schema columns.

        The three link columns use fixed keys (``feature_link``,
        ``provider_link``, ``source_link``); every other key comes from
        ``bundle``. A falsy ``commercial_schema`` is emitted as ``""``.
        """
        row = {}
        row[bundle.get("features_info_name")] = self.internal_name
        row["feature_link"] = self.doc_link
        row[bundle.get("features_info_shap")] = self.rounded_shap
        row[bundle.get("features_info_hitrate")] = self.hitrate
        row[bundle.get("features_info_value_preview")] = self.value_preview
        row[bundle.get("features_info_provider")] = self.internal_provider
        row["provider_link"] = self.data_provider_link
        row[bundle.get("features_info_source")] = self.internal_source
        row["source_link"] = self.data_source_link
        row[bundle.get("features_info_commercial_schema")] = self.commercial_schema or ""
        row[bundle.get("features_info_update_frequency")] = self.update_frequency
        return row
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
    """Build a short, comma-separated preview of values taken from a feature column.

    Three values are drawn at random (with replacement) from the distinct
    non-null values of ``data[feature_meta.name]``. Floats are rounded to four
    decimals, and the joined text is cut to 30 characters followed by ``"..."``
    when it is longer than that.

    Args:
        feature_meta: Metadata record; only its ``name`` attribute is read.
        data: Frame that may contain a column named ``feature_meta.name``.

    Returns:
        The preview text, or ``""`` when the column is absent or has no
        non-null values.
    """
    if feature_meta.name not in data.columns:
        return ""

    distinct_values = data[feature_meta.name].dropna().unique()
    # np.random.choice raises ValueError on an empty population, so a column
    # holding only missing values must short-circuit to an empty preview.
    if len(distinct_values) == 0:
        return ""

    feature_sample = np.random.choice(distinct_values, 3).tolist()
    if isinstance(feature_sample[0], float):
        feature_sample = [round(f, 4) for f in feature_sample]

    preview = ", ".join(str(f) for f in feature_sample)
    if len(preview) > 30:
        preview = preview[:30] + "..."
    return preview
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
    """Return the feature name, wrapped by ``_to_anchor`` when a doc link is set.

    Args:
        feature_meta: Metadata record; ``doc_link`` and ``name`` are read.

    Returns:
        ``_to_anchor(doc_link, name)`` for a truthy ``doc_link``, otherwise the
        plain ``name``.
    """
    doc_link = feature_meta.doc_link
    return _to_anchor(doc_link, feature_meta.name) if doc_link else feature_meta.name
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
    """Return ``feature_meta.name`` unchanged."""
    plain_name = feature_meta.name
    return plain_name
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the provider cell text for a feature.

    Provider names come from ``data_providers``, falling back to the single
    ``data_provider``; links are resolved the same way from
    ``data_provider_links`` / ``data_provider_link``.

    Args:
        feature_meta: Metadata record the provider fields are read from.
        is_client_feature: When true and no provider is known, the result is
            empty instead of the default Upgini anchor.

    Returns:
        The output of ``_make_links`` when at least one provider is known;
        otherwise ``""`` for client features and an anchor to
        ``https://upgini.com`` for all others.
    """
    names = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if names:
        return _make_links(names, links)
    if is_client_feature:
        return ""
    return _to_anchor("https://upgini.com", "Upgini")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the plain provider name for a feature.

    Args:
        feature_meta: Metadata record; only ``data_provider`` is read.
        is_client_feature: When true the result is always ``""``.

    Returns:
        ``""`` for client features, otherwise ``data_provider`` or ``"Upgini"``
        when that attribute is falsy.
    """
    if is_client_feature:
        return ""
    return feature_meta.data_provider or "Upgini"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the source cell text for a feature.

    Source names come from ``data_sources``, falling back to the single
    ``data_source``; links are resolved the same way from
    ``data_source_links`` / ``data_source_link``.

    Args:
        feature_meta: Metadata record the source fields are read from.
        is_client_feature: Forwarded to ``_get_internal_source`` when no
            source is known.

    Returns:
        The output of ``_make_links`` when at least one source is known,
        otherwise the result of ``_get_internal_source``.
    """
    names = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if not names:
        return _get_internal_source(feature_meta, is_client_feature)
    return _make_links(names, links)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the plain source name for a feature.

    Args:
        feature_meta: Metadata record; ``data_source`` and ``name`` are read.
        is_client_feature: When true, no default source is substituted.

    Returns:
        ``data_source`` when it is truthy. Otherwise ``LLM_SOURCE``, except
        that ``""`` is returned for client features and for names ending in
        ``_country`` or ``_postal_code``.
    """
    declared_source = feature_meta.data_source
    if declared_source:
        return declared_source

    has_excluded_suffix = feature_meta.name.endswith(("_country", "_postal_code"))
    if has_excluded_suffix or is_client_feature:
        return ""
    return LLM_SOURCE
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _list_or_single(lst: List[str], single: str):
    """Prefer a non-empty list; otherwise wrap the single value.

    Args:
        lst: Candidate list, returned as-is when truthy.
        single: Fallback value used when ``lst`` is falsy.

    Returns:
        ``lst`` when truthy, ``[single]`` when only ``single`` is truthy, and
        an empty list when both are falsy.
    """
    if lst:
        return lst
    return [single] if single else []
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_anchor(link: str, value: str) -> str:
    """Render ``value`` as an HTML anchor pointing at ``link`` when possible.

    Args:
        link: Target URL; when falsy, ``value`` is returned without markup.
        value: Visible text; when falsy, the result is ``""``.

    Returns:
        ``""`` for a falsy ``value``; ``value`` itself when ``link`` is falsy
        or ``value`` equals ``LLM_SOURCE``; otherwise an ``<a>`` element that
        opens in a new tab.
    """
    if not value:
        return ""
    # LLM_SOURCE is only compared once a link is present, as in the elif chain it replaces.
    if not link or value == LLM_SOURCE:
        return value
    return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _make_links(names: List[str], links: List[str]):
    """Pair names with links positionally and join the rendered results.

    Args:
        names: Visible texts.
        links: URLs matched to ``names`` by position.

    Returns:
        The ``_to_anchor`` results joined with ``","``. The shorter input is
        padded with ``None`` so every element of the longer one is visited.
    """
    paired = itertools.zip_longest(names, links)
    return ",".join(_to_anchor(link, name) for name, link in paired)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _round_shap_value(shap: float) -> float:
|
|
169
|
+
if shap > 0.0 and shap < 0.0001:
|
|
170
|
+
return 0.0001
|
|
171
|
+
else:
|
|
172
|
+
return round(shap, 4)
|
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
8
|
|
|
@@ -83,10 +84,21 @@ class FeaturesValidator:
|
|
|
83
84
|
return [
|
|
84
85
|
i
|
|
85
86
|
for i in df
|
|
86
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
87
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
|
|
87
88
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
88
89
|
]
|
|
89
90
|
|
|
91
|
+
@staticmethod
def __is_integer(series: pd.Series) -> bool:
    """Tell whether a series holds only integer-valued numbers.

    Args:
        series: Column to inspect.

    Returns:
        True when the series has an integer dtype, or when every non-null
        element is a ``float`` with no fractional part whose magnitude is
        below the int64 maximum. Elements of any other type make the result
        false.
    """
    if is_integer_dtype(series):
        return True

    int64_limit = np.iinfo(np.int64).max

    def _is_whole_float(value) -> bool:
        return isinstance(value, float) and float.is_integer(value) and abs(value) < int64_limit

    return series.dropna().apply(_is_whole_float).all()
|
|
101
|
+
|
|
90
102
|
@staticmethod
|
|
91
103
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
92
104
|
return [i for i in df if df[i].nunique() <= 1]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.28"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|