upgini 1.2.29__tar.gz → 1.2.29a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.29 → upgini-1.2.29a1}/PKG-INFO +1 -1
- upgini-1.2.29a1/src/upgini/__about__.py +1 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/features_enricher.py +130 -98
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/http.py +1 -1
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/metadata.py +4 -4
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/metrics.py +9 -33
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/datetime_utils.py +35 -44
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/email_utils.py +2 -3
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/features_validator.py +1 -13
- upgini-1.2.29/src/upgini/__about__.py +0 -1
- upgini-1.2.29/src/upgini/utils/feature_info.py +0 -172
- {upgini-1.2.29 → upgini-1.2.29a1}/.gitignore +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/LICENSE +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/README.md +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/pyproject.toml +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/ads.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/dataset.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/errors.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.29 → upgini-1.2.29a1}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.29a1"
|
|
@@ -54,7 +54,6 @@ from upgini.metadata import (
|
|
|
54
54
|
SYSTEM_RECORD_ID,
|
|
55
55
|
TARGET,
|
|
56
56
|
CVType,
|
|
57
|
-
FeaturesMetadataV2,
|
|
58
57
|
FileColumnMeaningType,
|
|
59
58
|
ModelTaskType,
|
|
60
59
|
RuntimeParameters,
|
|
@@ -96,7 +95,6 @@ from upgini.utils.email_utils import (
|
|
|
96
95
|
EmailSearchKeyConverter,
|
|
97
96
|
EmailSearchKeyDetector,
|
|
98
97
|
)
|
|
99
|
-
from upgini.utils.feature_info import FeatureInfo, _round_shap_value
|
|
100
98
|
from upgini.utils.features_validator import FeaturesValidator
|
|
101
99
|
from upgini.utils.format import Format
|
|
102
100
|
from upgini.utils.ip_utils import IpSearchKeyConverter
|
|
@@ -160,10 +158,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
160
158
|
|
|
161
159
|
shared_datasets: list of str, optional (default=None)
|
|
162
160
|
List of private shared dataset ids for custom search
|
|
163
|
-
|
|
164
|
-
select_features: bool, optional (default=False)
|
|
165
|
-
If True, return only selected features both from input and data sources.
|
|
166
|
-
Otherwise, return all features from input and only selected features from data sources.
|
|
167
161
|
"""
|
|
168
162
|
|
|
169
163
|
TARGET_NAME = "target"
|
|
@@ -230,7 +224,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
230
224
|
client_visitorid: Optional[str] = None,
|
|
231
225
|
custom_bundle_config: Optional[str] = None,
|
|
232
226
|
add_date_if_missing: bool = True,
|
|
233
|
-
select_features: bool = False,
|
|
234
227
|
**kwargs,
|
|
235
228
|
):
|
|
236
229
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -284,11 +277,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
284
277
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
285
278
|
self.metrics: Optional[pd.DataFrame] = None
|
|
286
279
|
self.feature_names_ = []
|
|
287
|
-
self.dropped_client_feature_names_ = []
|
|
288
280
|
self.feature_importances_ = []
|
|
289
281
|
self.search_id = search_id
|
|
290
|
-
self.select_features = select_features
|
|
291
|
-
|
|
292
282
|
if search_id:
|
|
293
283
|
search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
|
|
294
284
|
|
|
@@ -1009,10 +999,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1009
999
|
text_features=self.generate_features,
|
|
1010
1000
|
has_date=has_date,
|
|
1011
1001
|
)
|
|
1012
|
-
|
|
1002
|
+
etalon_metric, _ = baseline_estimator.cross_val_predict(
|
|
1013
1003
|
fitting_X, y_sorted, self.baseline_score_column
|
|
1014
1004
|
)
|
|
1015
|
-
etalon_metric = etalon_cv_result.get_display_metric()
|
|
1016
1005
|
if etalon_metric is None:
|
|
1017
1006
|
self.logger.info(
|
|
1018
1007
|
f"Baseline {metric} on train client features is None (maybe all features was removed)"
|
|
@@ -1044,9 +1033,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1044
1033
|
text_features=self.generate_features,
|
|
1045
1034
|
has_date=has_date,
|
|
1046
1035
|
)
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1036
|
+
enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
|
|
1037
|
+
fitting_enriched_X, enriched_y_sorted
|
|
1038
|
+
)
|
|
1050
1039
|
|
|
1051
1040
|
if enriched_shaps is not None:
|
|
1052
1041
|
self._update_shap_values(enriched_shaps)
|
|
@@ -1059,7 +1048,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1059
1048
|
else:
|
|
1060
1049
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
|
1061
1050
|
if etalon_metric is not None and enriched_metric is not None:
|
|
1062
|
-
uplift = (
|
|
1051
|
+
uplift = (enriched_metric - etalon_metric) * multiplier
|
|
1063
1052
|
|
|
1064
1053
|
train_metrics = {
|
|
1065
1054
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
@@ -1102,10 +1091,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1102
1091
|
f"Calculate baseline {metric} on eval set {idx + 1} "
|
|
1103
1092
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
|
1104
1093
|
)
|
|
1105
|
-
|
|
1094
|
+
etalon_eval_metric = baseline_estimator.calculate_metric(
|
|
1106
1095
|
eval_X_sorted, eval_y_sorted, self.baseline_score_column
|
|
1107
1096
|
)
|
|
1108
|
-
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
|
1109
1097
|
self.logger.info(
|
|
1110
1098
|
f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
|
|
1111
1099
|
)
|
|
@@ -1117,10 +1105,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1117
1105
|
f"Calculate enriched {metric} on eval set {idx + 1} "
|
|
1118
1106
|
f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
|
|
1119
1107
|
)
|
|
1120
|
-
|
|
1108
|
+
enriched_eval_metric = enriched_estimator.calculate_metric(
|
|
1121
1109
|
enriched_eval_X_sorted, enriched_eval_y_sorted
|
|
1122
1110
|
)
|
|
1123
|
-
enriched_eval_metric = enriched_eval_results.get_display_metric()
|
|
1124
1111
|
self.logger.info(
|
|
1125
1112
|
f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
|
|
1126
1113
|
)
|
|
@@ -1128,7 +1115,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1128
1115
|
enriched_eval_metric = None
|
|
1129
1116
|
|
|
1130
1117
|
if etalon_eval_metric is not None and enriched_eval_metric is not None:
|
|
1131
|
-
eval_uplift = (
|
|
1118
|
+
eval_uplift = (enriched_eval_metric - etalon_eval_metric) * multiplier
|
|
1132
1119
|
else:
|
|
1133
1120
|
eval_uplift = None
|
|
1134
1121
|
|
|
@@ -1211,7 +1198,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1211
1198
|
|
|
1212
1199
|
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1213
1200
|
new_shaps = {
|
|
1214
|
-
feature: _round_shap_value(shap)
|
|
1201
|
+
feature: self._round_shap_value(shap)
|
|
1202
|
+
for feature, shap in new_shaps.items()
|
|
1203
|
+
if feature in self.feature_names_
|
|
1215
1204
|
}
|
|
1216
1205
|
features_importances = list(new_shaps.items())
|
|
1217
1206
|
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
@@ -1260,7 +1249,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1260
1249
|
display_html_dataframe(
|
|
1261
1250
|
self.relevant_data_sources,
|
|
1262
1251
|
self._relevant_data_sources_wo_links,
|
|
1263
|
-
self.bundle.get("
|
|
1252
|
+
self.bundle.get("relevant_features_header"),
|
|
1264
1253
|
display_handle=self.data_sources_display_handle,
|
|
1265
1254
|
)
|
|
1266
1255
|
except (ImportError, NameError):
|
|
@@ -1448,12 +1437,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1448
1437
|
client_features = [
|
|
1449
1438
|
c
|
|
1450
1439
|
for c in X_sampled.columns.to_list()
|
|
1451
|
-
if
|
|
1452
|
-
not self.select_features
|
|
1453
|
-
or c in self.feature_names_
|
|
1454
|
-
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1455
|
-
)
|
|
1456
|
-
and c
|
|
1440
|
+
if c
|
|
1457
1441
|
not in (
|
|
1458
1442
|
excluding_search_keys
|
|
1459
1443
|
+ list(self.fit_dropped_features)
|
|
@@ -1669,10 +1653,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1669
1653
|
generated_features = []
|
|
1670
1654
|
if date_column is not None:
|
|
1671
1655
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1672
|
-
|
|
1673
|
-
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1674
|
-
df_with_date_features[date_column] = df[date_column]
|
|
1675
|
-
df = df_with_date_features
|
|
1656
|
+
df = converter.convert(df, keep_time=True)
|
|
1676
1657
|
generated_features = converter.generated_features
|
|
1677
1658
|
|
|
1678
1659
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1681,10 +1662,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1681
1662
|
df = generator.generate(df)
|
|
1682
1663
|
generated_features.extend(generator.generated_features)
|
|
1683
1664
|
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
columns_renaming = {c: c for c in df.columns}
|
|
1665
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
|
1666
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1667
|
+
columns_renaming = normalizer.columns_renaming
|
|
1688
1668
|
|
|
1689
1669
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1690
1670
|
|
|
@@ -2000,19 +1980,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2000
1980
|
file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
|
|
2001
1981
|
search_keys = file_metadata.search_types()
|
|
2002
1982
|
if SearchKey.IPV6_ADDRESS in search_keys:
|
|
2003
|
-
|
|
2004
|
-
search_keys.pop(SearchKey.IPV6_ADDRESS, None)
|
|
1983
|
+
search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
2005
1984
|
|
|
2006
|
-
keys = (
|
|
2007
|
-
"{"
|
|
2008
|
-
+ ", ".join(
|
|
2009
|
-
[
|
|
2010
|
-
f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
|
|
2011
|
-
for key, name in search_keys.items()
|
|
2012
|
-
]
|
|
2013
|
-
)
|
|
2014
|
-
+ "}"
|
|
2015
|
-
)
|
|
1985
|
+
keys = "{" + ", ".join([f'"{key.name}": "{key_example(key)}"' for key in search_keys]) + "}"
|
|
2016
1986
|
features_for_transform = self._search_task.get_features_for_transform()
|
|
2017
1987
|
if features_for_transform:
|
|
2018
1988
|
original_features_for_transform = [
|
|
@@ -2093,9 +2063,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2093
2063
|
|
|
2094
2064
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2095
2065
|
|
|
2096
|
-
columns_to_drop = [
|
|
2097
|
-
c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2098
|
-
]
|
|
2066
|
+
columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
|
|
2099
2067
|
if len(columns_to_drop) > 0:
|
|
2100
2068
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2101
2069
|
self.logger.warning(msg)
|
|
@@ -2124,7 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2124
2092
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2125
2093
|
if date_column is not None:
|
|
2126
2094
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2127
|
-
df = converter.convert(df
|
|
2095
|
+
df = converter.convert(df)
|
|
2128
2096
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2129
2097
|
generated_features.extend(converter.generated_features)
|
|
2130
2098
|
else:
|
|
@@ -2219,12 +2187,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2219
2187
|
|
|
2220
2188
|
if add_fit_system_record_id:
|
|
2221
2189
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2190
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2191
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2222
2192
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2223
2193
|
features_not_to_pass.append(SORT_ID)
|
|
2224
2194
|
|
|
2225
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2226
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2227
|
-
|
|
2228
2195
|
# search keys might be changed after explode
|
|
2229
2196
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2230
2197
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2243,7 +2210,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2243
2210
|
|
|
2244
2211
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2245
2212
|
|
|
2246
|
-
df_without_features = df.drop(columns=features_not_to_pass
|
|
2213
|
+
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2247
2214
|
|
|
2248
2215
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2249
2216
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -2352,15 +2319,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2352
2319
|
else:
|
|
2353
2320
|
result = enrich()
|
|
2354
2321
|
|
|
2355
|
-
selecting_columns = [
|
|
2356
|
-
c
|
|
2357
|
-
for c in itertools.chain(validated_X.columns.tolist(), generated_features)
|
|
2358
|
-
if c not in self.dropped_client_feature_names_
|
|
2359
|
-
]
|
|
2360
2322
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2361
|
-
|
|
2323
|
+
existing_filtered_columns = [
|
|
2362
2324
|
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2363
|
-
|
|
2325
|
+
]
|
|
2326
|
+
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2364
2327
|
if add_fit_system_record_id:
|
|
2365
2328
|
selecting_columns.append(SORT_ID)
|
|
2366
2329
|
|
|
@@ -3527,7 +3490,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3527
3490
|
|
|
3528
3491
|
return result_train, result_eval_sets
|
|
3529
3492
|
|
|
3493
|
+
@staticmethod
|
|
3494
|
+
def _round_shap_value(shap: float) -> float:
|
|
3495
|
+
if shap > 0.0 and shap < 0.0001:
|
|
3496
|
+
return 0.0001
|
|
3497
|
+
else:
|
|
3498
|
+
return round(shap, 4)
|
|
3499
|
+
|
|
3530
3500
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3501
|
+
llm_source = "LLM with external data augmentation"
|
|
3531
3502
|
if self._search_task is None:
|
|
3532
3503
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3533
3504
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3538,40 +3509,116 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3538
3509
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3539
3510
|
|
|
3540
3511
|
self.feature_names_ = []
|
|
3541
|
-
self.dropped_client_feature_names_ = []
|
|
3542
3512
|
self.feature_importances_ = []
|
|
3543
3513
|
features_info = []
|
|
3544
3514
|
features_info_without_links = []
|
|
3545
3515
|
internal_features_info = []
|
|
3546
3516
|
|
|
3517
|
+
def list_or_single(lst: List[str], single: str):
|
|
3518
|
+
return lst or ([single] if single else [])
|
|
3519
|
+
|
|
3520
|
+
def to_anchor(link: str, value: str) -> str:
|
|
3521
|
+
if not value:
|
|
3522
|
+
return ""
|
|
3523
|
+
elif not link:
|
|
3524
|
+
return value
|
|
3525
|
+
elif value == llm_source:
|
|
3526
|
+
return value
|
|
3527
|
+
else:
|
|
3528
|
+
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
3529
|
+
|
|
3530
|
+
def make_links(names: List[str], links: List[str]):
|
|
3531
|
+
all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
3532
|
+
return ",".join(all_links)
|
|
3533
|
+
|
|
3547
3534
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3548
3535
|
for feature_meta in features_meta:
|
|
3549
3536
|
if feature_meta.name in original_names_dict.keys():
|
|
3550
3537
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3551
|
-
|
|
3552
|
-
is_client_feature = feature_meta.name in x_columns
|
|
3553
|
-
|
|
3554
|
-
if feature_meta.shap_value == 0.0:
|
|
3555
|
-
if self.select_features:
|
|
3556
|
-
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3557
|
-
continue
|
|
3558
|
-
|
|
3559
|
-
# Use only important features
|
|
3538
|
+
# Use only enriched features
|
|
3560
3539
|
if (
|
|
3561
|
-
feature_meta.name in
|
|
3540
|
+
feature_meta.name in x_columns
|
|
3562
3541
|
or feature_meta.name == COUNTRY
|
|
3563
|
-
|
|
3564
|
-
or
|
|
3542
|
+
or feature_meta.shap_value == 0.0
|
|
3543
|
+
or feature_meta.name in self.fit_generated_features
|
|
3565
3544
|
):
|
|
3566
3545
|
continue
|
|
3567
3546
|
|
|
3547
|
+
feature_sample = []
|
|
3568
3548
|
self.feature_names_.append(feature_meta.name)
|
|
3569
|
-
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3549
|
+
self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
|
|
3550
|
+
if feature_meta.name in features_df.columns:
|
|
3551
|
+
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3552
|
+
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
3553
|
+
feature_sample = [round(f, 4) for f in feature_sample]
|
|
3554
|
+
feature_sample = [str(f) for f in feature_sample]
|
|
3555
|
+
feature_sample = ", ".join(feature_sample)
|
|
3556
|
+
if len(feature_sample) > 30:
|
|
3557
|
+
feature_sample = feature_sample[:30] + "..."
|
|
3558
|
+
|
|
3559
|
+
internal_provider = feature_meta.data_provider or "Upgini"
|
|
3560
|
+
providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
3561
|
+
provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
3562
|
+
if providers:
|
|
3563
|
+
provider = make_links(providers, provider_links)
|
|
3564
|
+
else:
|
|
3565
|
+
provider = to_anchor("https://upgini.com", "Upgini")
|
|
3570
3566
|
|
|
3571
|
-
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3567
|
+
internal_source = feature_meta.data_source or (
|
|
3568
|
+
llm_source
|
|
3569
|
+
if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
|
|
3570
|
+
else ""
|
|
3571
|
+
)
|
|
3572
|
+
sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
3573
|
+
source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
3574
|
+
if sources:
|
|
3575
|
+
source = make_links(sources, source_links)
|
|
3576
|
+
else:
|
|
3577
|
+
source = internal_source
|
|
3578
|
+
|
|
3579
|
+
internal_feature_name = feature_meta.name
|
|
3580
|
+
if feature_meta.doc_link:
|
|
3581
|
+
feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
3582
|
+
else:
|
|
3583
|
+
feature_name = internal_feature_name
|
|
3584
|
+
|
|
3585
|
+
features_info.append(
|
|
3586
|
+
{
|
|
3587
|
+
self.bundle.get("features_info_name"): feature_name,
|
|
3588
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3589
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3590
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3591
|
+
self.bundle.get("features_info_provider"): provider,
|
|
3592
|
+
self.bundle.get("features_info_source"): source,
|
|
3593
|
+
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3594
|
+
}
|
|
3595
|
+
)
|
|
3596
|
+
features_info_without_links.append(
|
|
3597
|
+
{
|
|
3598
|
+
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3599
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3600
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3601
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3602
|
+
self.bundle.get("features_info_provider"): internal_provider,
|
|
3603
|
+
self.bundle.get("features_info_source"): internal_source,
|
|
3604
|
+
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3605
|
+
}
|
|
3606
|
+
)
|
|
3607
|
+
internal_features_info.append(
|
|
3608
|
+
{
|
|
3609
|
+
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3610
|
+
"feature_link": feature_meta.doc_link,
|
|
3611
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3612
|
+
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3613
|
+
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3614
|
+
self.bundle.get("features_info_provider"): internal_provider,
|
|
3615
|
+
"provider_link": feature_meta.data_provider_link,
|
|
3616
|
+
self.bundle.get("features_info_source"): internal_source,
|
|
3617
|
+
"source_link": feature_meta.data_source_link,
|
|
3618
|
+
self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
|
|
3619
|
+
self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
|
|
3620
|
+
}
|
|
3621
|
+
)
|
|
3575
3622
|
|
|
3576
3623
|
if len(features_info) > 0:
|
|
3577
3624
|
self.features_info = pd.DataFrame(features_info)
|
|
@@ -3596,22 +3643,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3596
3643
|
autofe_meta = self._search_task.get_autofe_metadata()
|
|
3597
3644
|
if autofe_meta is None:
|
|
3598
3645
|
return None
|
|
3599
|
-
|
|
3600
|
-
|
|
3601
|
-
def to_feature_meta(row):
|
|
3602
|
-
fm = FeaturesMetadataV2(
|
|
3603
|
-
name=row[bundle.get("features_info_name")],
|
|
3604
|
-
type="",
|
|
3605
|
-
source="",
|
|
3606
|
-
hit_rate=bundle.get("features_info_hitrate"),
|
|
3607
|
-
shap_value=bundle.get("features_info_shap"),
|
|
3608
|
-
data_source=bundle.get("features_info_source"),
|
|
3609
|
-
)
|
|
3610
|
-
return fm
|
|
3611
|
-
|
|
3612
|
-
features_meta = self._internal_features_info.apply(to_feature_meta).to_list()
|
|
3613
|
-
else:
|
|
3614
|
-
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3646
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3615
3647
|
|
|
3616
3648
|
def get_feature_by_name(name: str):
|
|
3617
3649
|
for m in features_meta:
|
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] =
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
|
|
|
228
228
|
return c
|
|
229
229
|
return None
|
|
230
230
|
|
|
231
|
-
def search_types(self) ->
|
|
232
|
-
search_keys =
|
|
231
|
+
def search_types(self) -> Set[SearchKey]:
|
|
232
|
+
search_keys = set()
|
|
233
233
|
for keys_group in self.searchKeys:
|
|
234
234
|
for key in keys_group:
|
|
235
235
|
column = self.column_by_name(key)
|
|
236
236
|
if column:
|
|
237
|
-
search_keys
|
|
237
|
+
search_keys.add(SearchKey.from_meaning_type(column.meaningType))
|
|
238
238
|
return search_keys
|
|
239
239
|
|
|
240
240
|
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from dataclasses import dataclass
|
|
4
3
|
import inspect
|
|
5
4
|
import logging
|
|
6
5
|
import re
|
|
@@ -211,21 +210,6 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
211
210
|
}
|
|
212
211
|
|
|
213
212
|
|
|
214
|
-
@dataclass
|
|
215
|
-
class _CrossValResults:
|
|
216
|
-
metric: Optional[float]
|
|
217
|
-
metric_std: Optional[float]
|
|
218
|
-
shap_values: Optional[Dict[str, float]]
|
|
219
|
-
|
|
220
|
-
def get_display_metric(self) -> Optional[str]:
|
|
221
|
-
if self.metric is None:
|
|
222
|
-
return None
|
|
223
|
-
elif self.metric_std is None:
|
|
224
|
-
return f"{self.metric:.3f}"
|
|
225
|
-
else:
|
|
226
|
-
return f"{self.metric:.3f} ± {self.metric_std:.3f}"
|
|
227
|
-
|
|
228
|
-
|
|
229
213
|
class EstimatorWrapper:
|
|
230
214
|
def __init__(
|
|
231
215
|
self,
|
|
@@ -313,11 +297,11 @@ class EstimatorWrapper:
|
|
|
313
297
|
|
|
314
298
|
def cross_val_predict(
|
|
315
299
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
316
|
-
) ->
|
|
300
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
317
301
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
318
302
|
|
|
319
303
|
if x.shape[1] == 0:
|
|
320
|
-
return
|
|
304
|
+
return None
|
|
321
305
|
|
|
322
306
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
323
307
|
|
|
@@ -342,7 +326,7 @@ class EstimatorWrapper:
|
|
|
342
326
|
|
|
343
327
|
self.check_fold_metrics(metrics_by_fold)
|
|
344
328
|
|
|
345
|
-
metric
|
|
329
|
+
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
346
330
|
|
|
347
331
|
splits = self.cv.split(x, y, groups)
|
|
348
332
|
|
|
@@ -367,7 +351,7 @@ class EstimatorWrapper:
|
|
|
367
351
|
else:
|
|
368
352
|
average_shap_values = None
|
|
369
353
|
|
|
370
|
-
return
|
|
354
|
+
return self.post_process_metric(metric), average_shap_values
|
|
371
355
|
|
|
372
356
|
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
|
|
373
357
|
return shap_values
|
|
@@ -383,25 +367,17 @@ class EstimatorWrapper:
|
|
|
383
367
|
metric = 2 * metric - 1
|
|
384
368
|
return metric
|
|
385
369
|
|
|
386
|
-
def calculate_metric(
|
|
387
|
-
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
388
|
-
) -> _CrossValResults:
|
|
370
|
+
def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
|
|
389
371
|
x, y, _ = self._prepare_to_calculate(x, y)
|
|
390
372
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
391
|
-
metric
|
|
373
|
+
metric = roc_auc_score(y, x[baseline_score_column])
|
|
392
374
|
else:
|
|
393
375
|
metrics = []
|
|
394
376
|
for est in self.cv_estimators:
|
|
395
377
|
metrics.append(self.scorer(est, x, y))
|
|
396
378
|
|
|
397
|
-
metric
|
|
398
|
-
return
|
|
399
|
-
|
|
400
|
-
def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
|
|
401
|
-
metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
|
|
402
|
-
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
403
|
-
metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
|
|
404
|
-
return metric, metric_std
|
|
379
|
+
metric = np.mean(metrics) * self.multiplier
|
|
380
|
+
return self.post_process_metric(metric)
|
|
405
381
|
|
|
406
382
|
@staticmethod
|
|
407
383
|
def create(
|
|
@@ -615,7 +591,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
615
591
|
|
|
616
592
|
def cross_val_predict(
|
|
617
593
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
618
|
-
) ->
|
|
594
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
619
595
|
try:
|
|
620
596
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
621
597
|
except Exception as e:
|
|
@@ -109,63 +109,54 @@ class DateTimeSearchKeyConverter:
|
|
|
109
109
|
|
|
110
110
|
df = self.clean_old_dates(df)
|
|
111
111
|
|
|
112
|
-
# Define function to apply sine and cosine transformations
|
|
113
|
-
def add_cyclical_features(df, column, period):
|
|
114
|
-
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
|
-
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
|
-
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
-
if sin_feature not in df.columns:
|
|
118
|
-
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
-
self.generated_features.append(sin_feature)
|
|
120
|
-
if cos_feature not in df.columns:
|
|
121
|
-
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
-
self.generated_features.append(cos_feature)
|
|
123
|
-
|
|
124
|
-
df["quarter"] = df[self.date_column].dt.quarter
|
|
125
|
-
|
|
126
|
-
# Calculate the start date of the quarter for each timestamp
|
|
127
|
-
df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
|
|
128
|
-
|
|
129
|
-
# Calculate the day in the quarter
|
|
130
|
-
df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
|
|
131
|
-
|
|
132
|
-
# Vectorized calculation of days_in_quarter
|
|
133
|
-
quarter = df["quarter"]
|
|
134
|
-
start = df["quarter_start"]
|
|
135
|
-
year = start.dt.year
|
|
136
|
-
month = start.dt.month
|
|
137
|
-
|
|
138
|
-
quarter_end_year = np.where(quarter == 4, year + 1, year)
|
|
139
|
-
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
|
140
|
-
|
|
141
|
-
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
|
142
|
-
end.index = df.index
|
|
143
|
-
|
|
144
|
-
df["days_in_quarter"] = (end - start).dt.days
|
|
145
|
-
|
|
146
|
-
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
|
147
|
-
|
|
148
|
-
df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
|
|
149
|
-
|
|
150
112
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
|
151
113
|
|
|
152
114
|
seconds_without_na = df[seconds].dropna()
|
|
153
115
|
if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
|
|
154
116
|
self.logger.info("Time found in date search key. Add extra features based on time")
|
|
155
117
|
|
|
156
|
-
# Extract
|
|
118
|
+
# Extract time components
|
|
157
119
|
df["second"] = df[self.date_column].dt.second
|
|
158
120
|
df["minute"] = df[self.date_column].dt.minute
|
|
159
121
|
df["hour"] = df[self.date_column].dt.hour
|
|
122
|
+
df["day"] = df[self.date_column].dt.day
|
|
123
|
+
df["month"] = df[self.date_column].dt.month
|
|
124
|
+
|
|
125
|
+
# Get the actual number of days in each month
|
|
126
|
+
df["days_in_month"] = df[self.date_column].dt.days_in_month
|
|
127
|
+
|
|
128
|
+
# Define function to apply sine and cosine transformations
|
|
129
|
+
def add_cyclical_features(df, column, period):
|
|
130
|
+
sin_feature = f"datetime_{column}_sin_{period}"
|
|
131
|
+
cos_feature = f"datetime_{column}_cos_{period}"
|
|
132
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
133
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
134
|
+
self.generated_features.append(sin_feature)
|
|
135
|
+
self.generated_features.append(cos_feature)
|
|
160
136
|
|
|
161
|
-
# Apply
|
|
137
|
+
# Apply transformations using vectorized operations
|
|
162
138
|
add_cyclical_features(df, "second", 60) # Seconds in a minute
|
|
163
139
|
add_cyclical_features(df, "minute", 60) # Minutes in an hour
|
|
164
|
-
add_cyclical_features(df, "minute", 30) # Minutes in half an hour
|
|
165
140
|
add_cyclical_features(df, "hour", 24) # Hours in a day
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
141
|
+
add_cyclical_features(df, "day", df["days_in_month"]) # Days in the specific month
|
|
142
|
+
add_cyclical_features(df, "month", 12) # Months in a year
|
|
143
|
+
|
|
144
|
+
# Extract quarter information
|
|
145
|
+
df["quarter"] = df[self.date_column].dt.quarter
|
|
146
|
+
|
|
147
|
+
# Apply transformations for quarters
|
|
148
|
+
add_cyclical_features(df, "quarter", 4) # Quarters in a year
|
|
149
|
+
# seconds_in_day = 60 * 60 * 24
|
|
150
|
+
# orders = [1, 2, 24, 48]
|
|
151
|
+
# for order in orders:
|
|
152
|
+
# sin_feature = f"datetime_time_sin_{order}"
|
|
153
|
+
# cos_feature = f"datetime_time_cos_{order}"
|
|
154
|
+
# df[sin_feature] = np.round(np.sin(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
|
|
155
|
+
# df[cos_feature] = np.round(np.cos(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
|
|
156
|
+
# self.generated_features.append(sin_feature)
|
|
157
|
+
# self.generated_features.append(cos_feature)
|
|
158
|
+
|
|
159
|
+
df.drop(columns=["second", "minute", "hour", "day", "month", "days_in_month", "quarter"])
|
|
169
160
|
|
|
170
161
|
df.drop(columns=seconds, inplace=True)
|
|
171
162
|
|
|
@@ -38,9 +38,8 @@ class EmailDomainGenerator:
|
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
39
|
for email_col in self.email_columns:
|
|
40
40
|
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
self.generated_features.append(domain_feature)
|
|
41
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain)
|
|
42
|
+
self.generated_features.append(domain_feature)
|
|
44
43
|
return df
|
|
45
44
|
|
|
46
45
|
@staticmethod
|
|
@@ -2,7 +2,6 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
-
import numpy as np
|
|
6
5
|
import pandas as pd
|
|
7
6
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
8
7
|
|
|
@@ -84,21 +83,10 @@ class FeaturesValidator:
|
|
|
84
83
|
return [
|
|
85
84
|
i
|
|
86
85
|
for i in df
|
|
87
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
86
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
|
|
88
87
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
89
88
|
]
|
|
90
89
|
|
|
91
|
-
@staticmethod
|
|
92
|
-
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
-
return (
|
|
94
|
-
is_integer_dtype(series)
|
|
95
|
-
or series.dropna()
|
|
96
|
-
.apply(
|
|
97
|
-
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
-
)
|
|
99
|
-
.all()
|
|
100
|
-
)
|
|
101
|
-
|
|
102
90
|
@staticmethod
|
|
103
91
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
104
92
|
return [i for i in df if df[i].nunique() <= 1]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.29"
|
|
@@ -1,172 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
import itertools
|
|
3
|
-
from typing import Dict, List
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
import pandas as pd
|
|
7
|
-
|
|
8
|
-
from upgini.metadata import FeaturesMetadataV2
|
|
9
|
-
from upgini.resource_bundle import ResourceBundle
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
LLM_SOURCE = "LLM with external data augmentation"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataclass
|
|
16
|
-
class FeatureInfo:
|
|
17
|
-
name: str
|
|
18
|
-
internal_name: str
|
|
19
|
-
rounded_shap: float
|
|
20
|
-
hitrate: float
|
|
21
|
-
value_preview: str
|
|
22
|
-
provider: str
|
|
23
|
-
internal_provider: str
|
|
24
|
-
source: str
|
|
25
|
-
internal_source: str
|
|
26
|
-
update_frequency: str
|
|
27
|
-
commercial_schema: str
|
|
28
|
-
doc_link: str
|
|
29
|
-
data_provider_link: str
|
|
30
|
-
data_source_link: str
|
|
31
|
-
|
|
32
|
-
@staticmethod
|
|
33
|
-
def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
|
|
34
|
-
return FeatureInfo(
|
|
35
|
-
name=_get_name(feature_meta),
|
|
36
|
-
internal_name=_get_internal_name(feature_meta),
|
|
37
|
-
rounded_shap=_round_shap_value(feature_meta.shap_value),
|
|
38
|
-
hitrate=feature_meta.hit_rate,
|
|
39
|
-
value_preview=_get_feature_sample(feature_meta, data),
|
|
40
|
-
provider=_get_provider(feature_meta, is_client_feature),
|
|
41
|
-
internal_provider=_get_internal_provider(feature_meta, is_client_feature),
|
|
42
|
-
source=_get_source(feature_meta, is_client_feature),
|
|
43
|
-
internal_source=_get_internal_source(feature_meta, is_client_feature),
|
|
44
|
-
update_frequency=feature_meta.update_frequency,
|
|
45
|
-
commercial_schema=feature_meta.commercial_schema,
|
|
46
|
-
doc_link=feature_meta.doc_link,
|
|
47
|
-
data_provider_link=feature_meta.data_provider_link,
|
|
48
|
-
data_source_link=feature_meta.data_source_link,
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
52
|
-
return {
|
|
53
|
-
bundle.get("features_info_name"): self.name,
|
|
54
|
-
bundle.get("features_info_shap"): self.rounded_shap,
|
|
55
|
-
bundle.get("features_info_hitrate"): self.hitrate,
|
|
56
|
-
bundle.get("features_info_value_preview"): self.value_preview,
|
|
57
|
-
bundle.get("features_info_provider"): self.provider,
|
|
58
|
-
bundle.get("features_info_source"): self.source,
|
|
59
|
-
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
63
|
-
return {
|
|
64
|
-
bundle.get("features_info_name"): self.internal_name,
|
|
65
|
-
bundle.get("features_info_shap"): self.rounded_shap,
|
|
66
|
-
bundle.get("features_info_hitrate"): self.hitrate,
|
|
67
|
-
bundle.get("features_info_value_preview"): self.value_preview,
|
|
68
|
-
bundle.get("features_info_provider"): self.internal_provider,
|
|
69
|
-
bundle.get("features_info_source"): self.internal_source,
|
|
70
|
-
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
|
|
74
|
-
return {
|
|
75
|
-
bundle.get("features_info_name"): self.internal_name,
|
|
76
|
-
"feature_link": self.doc_link,
|
|
77
|
-
bundle.get("features_info_shap"): self.rounded_shap,
|
|
78
|
-
bundle.get("features_info_hitrate"): self.hitrate,
|
|
79
|
-
bundle.get("features_info_value_preview"): self.value_preview,
|
|
80
|
-
bundle.get("features_info_provider"): self.internal_provider,
|
|
81
|
-
"provider_link": self.data_provider_link,
|
|
82
|
-
bundle.get("features_info_source"): self.internal_source,
|
|
83
|
-
"source_link": self.data_source_link,
|
|
84
|
-
bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
|
|
85
|
-
bundle.get("features_info_update_frequency"): self.update_frequency,
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
|
|
90
|
-
if feature_meta.name in data.columns:
|
|
91
|
-
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
92
|
-
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
93
|
-
feature_sample = [round(f, 4) for f in feature_sample]
|
|
94
|
-
feature_sample = [str(f) for f in feature_sample]
|
|
95
|
-
feature_sample = ", ".join(feature_sample)
|
|
96
|
-
if len(feature_sample) > 30:
|
|
97
|
-
feature_sample = feature_sample[:30] + "..."
|
|
98
|
-
else:
|
|
99
|
-
feature_sample = ""
|
|
100
|
-
return feature_sample
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
104
|
-
if feature_meta.doc_link:
|
|
105
|
-
return _to_anchor(feature_meta.doc_link, feature_meta.name)
|
|
106
|
-
else:
|
|
107
|
-
return feature_meta.name
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
|
|
111
|
-
return feature_meta.name
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
115
|
-
providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
|
|
116
|
-
provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
|
|
117
|
-
if providers:
|
|
118
|
-
provider = _make_links(providers, provider_links)
|
|
119
|
-
else:
|
|
120
|
-
provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
|
|
121
|
-
return provider
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
125
|
-
return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
129
|
-
sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
|
|
130
|
-
source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
|
|
131
|
-
if sources:
|
|
132
|
-
source = _make_links(sources, source_links)
|
|
133
|
-
else:
|
|
134
|
-
source = _get_internal_source(feature_meta, is_client_feature)
|
|
135
|
-
return source
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
|
|
139
|
-
return feature_meta.data_source or (
|
|
140
|
-
LLM_SOURCE
|
|
141
|
-
if not feature_meta.name.endswith("_country")
|
|
142
|
-
and not feature_meta.name.endswith("_postal_code")
|
|
143
|
-
and not is_client_feature
|
|
144
|
-
else ""
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def _list_or_single(lst: List[str], single: str):
|
|
149
|
-
return lst or ([single] if single else [])
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
def _to_anchor(link: str, value: str) -> str:
|
|
153
|
-
if not value:
|
|
154
|
-
return ""
|
|
155
|
-
elif not link:
|
|
156
|
-
return value
|
|
157
|
-
elif value == LLM_SOURCE:
|
|
158
|
-
return value
|
|
159
|
-
else:
|
|
160
|
-
return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
def _make_links(names: List[str], links: List[str]):
|
|
164
|
-
all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
|
|
165
|
-
return ",".join(all_links)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
def _round_shap_value(shap: float) -> float:
|
|
169
|
-
if shap > 0.0 and shap < 0.0001:
|
|
170
|
-
return 0.0001
|
|
171
|
-
else:
|
|
172
|
-
return round(shap, 4)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|