upgini 1.2.29a7__py3-none-any.whl → 1.2.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +77 -47
- upgini/metadata.py +4 -4
- upgini/resource_bundle/strings.properties +8 -0
- {upgini-1.2.29a7.dist-info → upgini-1.2.30.dist-info}/METADATA +1 -1
- {upgini-1.2.29a7.dist-info → upgini-1.2.30.dist-info}/RECORD +8 -8
- {upgini-1.2.29a7.dist-info → upgini-1.2.30.dist-info}/WHEEL +0 -0
- {upgini-1.2.29a7.dist-info → upgini-1.2.30.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.29a7"
|
|
1
|
+
__version__ = "1.2.30"
|
upgini/features_enricher.py
CHANGED
|
@@ -54,6 +54,7 @@ from upgini.metadata import (
|
|
|
54
54
|
SYSTEM_RECORD_ID,
|
|
55
55
|
TARGET,
|
|
56
56
|
CVType,
|
|
57
|
+
FeaturesMetadataV2,
|
|
57
58
|
FileColumnMeaningType,
|
|
58
59
|
ModelTaskType,
|
|
59
60
|
RuntimeParameters,
|
|
@@ -349,6 +350,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
349
350
|
self.add_date_if_missing = add_date_if_missing
|
|
350
351
|
self.features_info_display_handle = None
|
|
351
352
|
self.data_sources_display_handle = None
|
|
353
|
+
self.autofe_features_display_handle = None
|
|
352
354
|
self.report_button_handle = None
|
|
353
355
|
|
|
354
356
|
def _get_api_key(self):
|
|
@@ -1048,7 +1050,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1048
1050
|
enriched_shaps = enriched_cv_result.shap_values
|
|
1049
1051
|
|
|
1050
1052
|
if enriched_shaps is not None:
|
|
1051
|
-
self._update_shap_values(enriched_shaps)
|
|
1053
|
+
self._update_shap_values(trace_id, validated_X.columns.to_list(), enriched_shaps)
|
|
1052
1054
|
|
|
1053
1055
|
if enriched_metric is None:
|
|
1054
1056
|
self.logger.warning(
|
|
@@ -1208,37 +1210,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1208
1210
|
finally:
|
|
1209
1211
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1210
1212
|
|
|
1211
|
-
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1213
|
+
def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
|
|
1212
1214
|
new_shaps = {
|
|
1213
1215
|
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1214
1216
|
}
|
|
1215
|
-
|
|
1216
|
-
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
1217
|
-
self.feature_names_, self.feature_importances_ = zip(*features_importances)
|
|
1218
|
-
self.feature_names_ = list(self.feature_names_)
|
|
1219
|
-
self.feature_importances_ = list(self.feature_importances_)
|
|
1220
|
-
|
|
1221
|
-
feature_name_header = self.bundle.get("features_info_name")
|
|
1222
|
-
shap_value_header = self.bundle.get("features_info_shap")
|
|
1223
|
-
|
|
1224
|
-
def update_shap(row):
|
|
1225
|
-
return new_shaps.get(row[feature_name_header], row[shap_value_header])
|
|
1226
|
-
|
|
1227
|
-
self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
|
|
1228
|
-
self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
|
|
1229
|
-
self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
|
|
1230
|
-
update_shap, axis=1
|
|
1231
|
-
)
|
|
1232
|
-
self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
|
|
1233
|
-
|
|
1234
|
-
self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1235
|
-
self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1236
|
-
self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1237
|
-
|
|
1238
|
-
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
|
|
1239
|
-
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
1240
|
-
self._features_info_without_links, self.bundle
|
|
1241
|
-
)
|
|
1217
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1242
1218
|
|
|
1243
1219
|
if self.features_info_display_handle is not None:
|
|
1244
1220
|
try:
|
|
@@ -1251,7 +1227,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1251
1227
|
display_handle=self.features_info_display_handle,
|
|
1252
1228
|
)
|
|
1253
1229
|
except (ImportError, NameError):
|
|
1254
|
-
|
|
1230
|
+
pass
|
|
1255
1231
|
if self.data_sources_display_handle is not None:
|
|
1256
1232
|
try:
|
|
1257
1233
|
_ = get_ipython() # type: ignore
|
|
@@ -1259,11 +1235,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1259
1235
|
display_html_dataframe(
|
|
1260
1236
|
self.relevant_data_sources,
|
|
1261
1237
|
self._relevant_data_sources_wo_links,
|
|
1262
|
-
self.bundle.get("
|
|
1238
|
+
self.bundle.get("relevant_data_sources_header"),
|
|
1263
1239
|
display_handle=self.data_sources_display_handle,
|
|
1264
1240
|
)
|
|
1265
1241
|
except (ImportError, NameError):
|
|
1266
|
-
|
|
1242
|
+
pass
|
|
1243
|
+
if self.autofe_features_display_handle is not None:
|
|
1244
|
+
try:
|
|
1245
|
+
_ = get_ipython() # type: ignore
|
|
1246
|
+
autofe_descriptions_df = self.get_autofe_features_description()
|
|
1247
|
+
if autofe_descriptions_df is not None:
|
|
1248
|
+
display_html_dataframe(
|
|
1249
|
+
df=autofe_descriptions_df,
|
|
1250
|
+
internal_df=autofe_descriptions_df,
|
|
1251
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
|
1252
|
+
display_handle=self.autofe_features_display_handle,
|
|
1253
|
+
)
|
|
1254
|
+
except (ImportError, NameError):
|
|
1255
|
+
pass
|
|
1267
1256
|
if self.report_button_handle is not None:
|
|
1268
1257
|
try:
|
|
1269
1258
|
_ = get_ipython() # type: ignore
|
|
@@ -1999,9 +1988,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1999
1988
|
file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
|
|
2000
1989
|
search_keys = file_metadata.search_types()
|
|
2001
1990
|
if SearchKey.IPV6_ADDRESS in search_keys:
|
|
2002
|
-
search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
1991
|
+
# search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
1992
|
+
search_keys.pop(SearchKey.IPV6_ADDRESS, None)
|
|
2003
1993
|
|
|
2004
|
-
keys =
|
|
1994
|
+
keys = (
|
|
1995
|
+
"{"
|
|
1996
|
+
+ ", ".join(
|
|
1997
|
+
[
|
|
1998
|
+
f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
|
|
1999
|
+
for key, name in search_keys.items()
|
|
2000
|
+
]
|
|
2001
|
+
)
|
|
2002
|
+
+ "}"
|
|
2003
|
+
)
|
|
2005
2004
|
features_for_transform = self._search_task.get_features_for_transform()
|
|
2006
2005
|
if features_for_transform:
|
|
2007
2006
|
original_features_for_transform = [
|
|
@@ -2804,7 +2803,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2804
2803
|
autofe_description = self.get_autofe_features_description()
|
|
2805
2804
|
if autofe_description is not None:
|
|
2806
2805
|
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
|
2807
|
-
display_html_dataframe(
|
|
2806
|
+
self.autofe_features_display_handle = display_html_dataframe(
|
|
2807
|
+
df=autofe_description,
|
|
2808
|
+
internal_df=autofe_description,
|
|
2809
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
|
2810
|
+
display_id="autofe_descriptions",
|
|
2811
|
+
)
|
|
2808
2812
|
|
|
2809
2813
|
if self._has_paid_features(exclude_features_sources):
|
|
2810
2814
|
if calculate_metrics is not None and calculate_metrics:
|
|
@@ -3516,7 +3520,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3516
3520
|
|
|
3517
3521
|
return result_train, result_eval_sets
|
|
3518
3522
|
|
|
3519
|
-
def __prepare_feature_importances(
|
|
3523
|
+
def __prepare_feature_importances(
|
|
3524
|
+
self, trace_id: str, x_columns: List[str], updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
|
3525
|
+
):
|
|
3520
3526
|
if self._search_task is None:
|
|
3521
3527
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3522
3528
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3533,6 +3539,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3533
3539
|
features_info_without_links = []
|
|
3534
3540
|
internal_features_info = []
|
|
3535
3541
|
|
|
3542
|
+
if updated_shaps is not None:
|
|
3543
|
+
for fm in features_meta:
|
|
3544
|
+
fm.shap_value = updated_shaps.get(fm.name, 0.0)
|
|
3545
|
+
|
|
3536
3546
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3537
3547
|
for feature_meta in features_meta:
|
|
3538
3548
|
if feature_meta.name in original_names_dict.keys():
|
|
@@ -3585,7 +3595,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3585
3595
|
autofe_meta = self._search_task.get_autofe_metadata()
|
|
3586
3596
|
if autofe_meta is None:
|
|
3587
3597
|
return None
|
|
3588
|
-
|
|
3598
|
+
if len(self._internal_features_info) != 0:
|
|
3599
|
+
|
|
3600
|
+
def to_feature_meta(row):
|
|
3601
|
+
fm = FeaturesMetadataV2(
|
|
3602
|
+
name=row[bundle.get("features_info_name")],
|
|
3603
|
+
type="",
|
|
3604
|
+
source="",
|
|
3605
|
+
hit_rate=row[bundle.get("features_info_hitrate")],
|
|
3606
|
+
shap_value=row[bundle.get("features_info_shap")],
|
|
3607
|
+
data_source=row[bundle.get("features_info_source")],
|
|
3608
|
+
)
|
|
3609
|
+
return fm
|
|
3610
|
+
|
|
3611
|
+
features_meta = self._internal_features_info.apply(to_feature_meta, axis=1).to_list()
|
|
3612
|
+
else:
|
|
3613
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3589
3614
|
|
|
3590
3615
|
def get_feature_by_name(name: str):
|
|
3591
3616
|
for m in features_meta:
|
|
@@ -3614,27 +3639,32 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3614
3639
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
|
3615
3640
|
continue
|
|
3616
3641
|
description["shap"] = feature_meta.shap_value
|
|
3617
|
-
description["
|
|
3618
|
-
"AutoFE:
|
|
3619
|
-
)
|
|
3620
|
-
description["
|
|
3642
|
+
description[self.bundle.get("autofe_descriptions_sources")] = feature_meta.data_source.replace(
|
|
3643
|
+
"AutoFE: features from ", ""
|
|
3644
|
+
).replace("AutoFE: feature from ", "")
|
|
3645
|
+
description[self.bundle.get("autofe_descriptions_feature_name")] = feature_meta.name
|
|
3621
3646
|
|
|
3622
3647
|
feature_idx = 1
|
|
3623
3648
|
for bc in m.base_columns:
|
|
3624
|
-
description[
|
|
3649
|
+
description[self.bundle.get("autofe_descriptions_feature").format(feature_idx)] = bc.hashed_name
|
|
3625
3650
|
feature_idx += 1
|
|
3626
3651
|
|
|
3627
|
-
description["
|
|
3652
|
+
description[self.bundle.get("autofe_descriptions_function")] = ",".join(
|
|
3653
|
+
sorted(autofe_feature.get_all_operand_names())
|
|
3654
|
+
)
|
|
3628
3655
|
|
|
3629
3656
|
descriptions.append(description)
|
|
3630
3657
|
|
|
3631
3658
|
if len(descriptions) == 0:
|
|
3632
3659
|
return None
|
|
3633
3660
|
|
|
3634
|
-
descriptions_df =
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3661
|
+
descriptions_df = (
|
|
3662
|
+
pd.DataFrame(descriptions)
|
|
3663
|
+
.fillna("")
|
|
3664
|
+
.sort_values(by="shap", ascending=False)
|
|
3665
|
+
.drop(columns="shap")
|
|
3666
|
+
.reset_index(drop=True)
|
|
3667
|
+
)
|
|
3638
3668
|
return descriptions_df
|
|
3639
3669
|
|
|
3640
3670
|
except Exception:
|
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional,
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
|
|
|
228
228
|
return c
|
|
229
229
|
return None
|
|
230
230
|
|
|
231
|
-
def search_types(self) ->
|
|
232
|
-
search_keys =
|
|
231
|
+
def search_types(self) -> Dict[SearchKey, str]:
|
|
232
|
+
search_keys = dict()
|
|
233
233
|
for keys_group in self.searchKeys:
|
|
234
234
|
for key in keys_group:
|
|
235
235
|
column = self.column_by_name(key)
|
|
236
236
|
if column:
|
|
237
|
-
search_keys
|
|
237
|
+
search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
|
|
238
238
|
return search_keys
|
|
239
239
|
|
|
240
240
|
|
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -251,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
|
|
|
251
251
|
relevant_data_sources_all_shap=All features SHAP
|
|
252
252
|
relevant_data_sources_number=Number of relevant features
|
|
253
253
|
|
|
254
|
+
# Autofe descriptions
|
|
255
|
+
autofe_descriptions_header=*Description of AutoFE feature names
|
|
256
|
+
autofe_descriptions_sources=Sources
|
|
257
|
+
autofe_descriptions_feature_name=Feature name
|
|
258
|
+
autofe_descriptions_feature=Feature {}
|
|
259
|
+
autofe_descriptions_function=Function
|
|
260
|
+
|
|
261
|
+
|
|
254
262
|
# Quality metrics table
|
|
255
263
|
quality_metrics_header=Accuracy after enrichment
|
|
256
264
|
quality_metrics_train_segment=Train
|
|
{upgini-1.2.29a7.dist-info → upgini-1.2.30.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=WGj1CvVJizDkAvd9BtLpwcsI-hzacJoXGbC8sVpoHYk,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=lNfu5Z40NmkkGJScKAwe_0VBtL8liePifuAlKE_flfA,192053
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=bKw_rjZZTomLJhQBqiM7_P2EoRq45_Ng2gP4WE6MRBE,26921
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.30.dist-info/METADATA,sha256=93iMDL28nXF2DJaDP-oUS2CeCFHgnjk2zmAhge2LAHg,48578
|
|
63
|
+
upgini-1.2.30.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.30.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.30.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|