upgini 1.2.29a6__py3-none-any.whl → 1.2.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +97 -58
- upgini/http.py +1 -1
- upgini/metadata.py +4 -4
- upgini/resource_bundle/strings.properties +8 -0
- upgini/utils/datetime_utils.py +6 -4
- upgini/utils/email_utils.py +3 -2
- upgini/utils/features_validator.py +13 -1
- {upgini-1.2.29a6.dist-info → upgini-1.2.30.dist-info}/METADATA +1 -1
- {upgini-1.2.29a6.dist-info → upgini-1.2.30.dist-info}/RECORD +12 -12
- {upgini-1.2.29a6.dist-info → upgini-1.2.30.dist-info}/WHEEL +1 -1
- {upgini-1.2.29a6.dist-info → upgini-1.2.30.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.29a6"
|
|
1
|
+
__version__ = "1.2.30"
|
upgini/features_enricher.py
CHANGED
|
@@ -350,6 +350,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
350
350
|
self.add_date_if_missing = add_date_if_missing
|
|
351
351
|
self.features_info_display_handle = None
|
|
352
352
|
self.data_sources_display_handle = None
|
|
353
|
+
self.autofe_features_display_handle = None
|
|
353
354
|
self.report_button_handle = None
|
|
354
355
|
|
|
355
356
|
def _get_api_key(self):
|
|
@@ -1049,7 +1050,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1049
1050
|
enriched_shaps = enriched_cv_result.shap_values
|
|
1050
1051
|
|
|
1051
1052
|
if enriched_shaps is not None:
|
|
1052
|
-
self._update_shap_values(enriched_shaps)
|
|
1053
|
+
self._update_shap_values(trace_id, validated_X.columns.to_list(), enriched_shaps)
|
|
1053
1054
|
|
|
1054
1055
|
if enriched_metric is None:
|
|
1055
1056
|
self.logger.warning(
|
|
@@ -1209,37 +1210,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1209
1210
|
finally:
|
|
1210
1211
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1211
1212
|
|
|
1212
|
-
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1213
|
+
def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
|
|
1213
1214
|
new_shaps = {
|
|
1214
1215
|
feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
|
|
1215
1216
|
}
|
|
1216
|
-
|
|
1217
|
-
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
1218
|
-
self.feature_names_, self.feature_importances_ = zip(*features_importances)
|
|
1219
|
-
self.feature_names_ = list(self.feature_names_)
|
|
1220
|
-
self.feature_importances_ = list(self.feature_importances_)
|
|
1221
|
-
|
|
1222
|
-
feature_name_header = self.bundle.get("features_info_name")
|
|
1223
|
-
shap_value_header = self.bundle.get("features_info_shap")
|
|
1224
|
-
|
|
1225
|
-
def update_shap(row):
|
|
1226
|
-
return new_shaps.get(row[feature_name_header], row[shap_value_header])
|
|
1227
|
-
|
|
1228
|
-
self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
|
|
1229
|
-
self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
|
|
1230
|
-
self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
|
|
1231
|
-
update_shap, axis=1
|
|
1232
|
-
)
|
|
1233
|
-
self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
|
|
1234
|
-
|
|
1235
|
-
self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1236
|
-
self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1237
|
-
self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1238
|
-
|
|
1239
|
-
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
|
|
1240
|
-
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
1241
|
-
self._features_info_without_links, self.bundle
|
|
1242
|
-
)
|
|
1217
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1243
1218
|
|
|
1244
1219
|
if self.features_info_display_handle is not None:
|
|
1245
1220
|
try:
|
|
@@ -1252,7 +1227,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1252
1227
|
display_handle=self.features_info_display_handle,
|
|
1253
1228
|
)
|
|
1254
1229
|
except (ImportError, NameError):
|
|
1255
|
-
|
|
1230
|
+
pass
|
|
1256
1231
|
if self.data_sources_display_handle is not None:
|
|
1257
1232
|
try:
|
|
1258
1233
|
_ = get_ipython() # type: ignore
|
|
@@ -1260,11 +1235,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1260
1235
|
display_html_dataframe(
|
|
1261
1236
|
self.relevant_data_sources,
|
|
1262
1237
|
self._relevant_data_sources_wo_links,
|
|
1263
|
-
self.bundle.get("
|
|
1238
|
+
self.bundle.get("relevant_data_sources_header"),
|
|
1264
1239
|
display_handle=self.data_sources_display_handle,
|
|
1265
1240
|
)
|
|
1266
1241
|
except (ImportError, NameError):
|
|
1267
|
-
|
|
1242
|
+
pass
|
|
1243
|
+
if self.autofe_features_display_handle is not None:
|
|
1244
|
+
try:
|
|
1245
|
+
_ = get_ipython() # type: ignore
|
|
1246
|
+
autofe_descriptions_df = self.get_autofe_features_description()
|
|
1247
|
+
if autofe_descriptions_df is not None:
|
|
1248
|
+
display_html_dataframe(
|
|
1249
|
+
df=autofe_descriptions_df,
|
|
1250
|
+
internal_df=autofe_descriptions_df,
|
|
1251
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
|
1252
|
+
display_handle=self.autofe_features_display_handle,
|
|
1253
|
+
)
|
|
1254
|
+
except (ImportError, NameError):
|
|
1255
|
+
pass
|
|
1268
1256
|
if self.report_button_handle is not None:
|
|
1269
1257
|
try:
|
|
1270
1258
|
_ = get_ipython() # type: ignore
|
|
@@ -1448,7 +1436,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1448
1436
|
client_features = [
|
|
1449
1437
|
c
|
|
1450
1438
|
for c in X_sampled.columns.to_list()
|
|
1451
|
-
if (
|
|
1439
|
+
if (
|
|
1440
|
+
not self.select_features
|
|
1441
|
+
or c in self.feature_names_
|
|
1442
|
+
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1443
|
+
)
|
|
1452
1444
|
and c
|
|
1453
1445
|
not in (
|
|
1454
1446
|
excluding_search_keys
|
|
@@ -1665,7 +1657,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1665
1657
|
generated_features = []
|
|
1666
1658
|
if date_column is not None:
|
|
1667
1659
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1668
|
-
|
|
1660
|
+
# Leave original date column values
|
|
1661
|
+
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1662
|
+
df_with_date_features[date_column] = df[date_column]
|
|
1663
|
+
df = df_with_date_features
|
|
1669
1664
|
generated_features = converter.generated_features
|
|
1670
1665
|
|
|
1671
1666
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1674,9 +1669,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1674
1669
|
df = generator.generate(df)
|
|
1675
1670
|
generated_features.extend(generator.generated_features)
|
|
1676
1671
|
|
|
1677
|
-
normalizer = Normalizer(self.bundle, self.logger)
|
|
1678
|
-
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1679
|
-
columns_renaming = normalizer.columns_renaming
|
|
1672
|
+
# normalizer = Normalizer(self.bundle, self.logger)
|
|
1673
|
+
# df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1674
|
+
# columns_renaming = normalizer.columns_renaming
|
|
1675
|
+
columns_renaming = {c: c for c in df.columns}
|
|
1680
1676
|
|
|
1681
1677
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1682
1678
|
|
|
@@ -1992,9 +1988,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1992
1988
|
file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
|
|
1993
1989
|
search_keys = file_metadata.search_types()
|
|
1994
1990
|
if SearchKey.IPV6_ADDRESS in search_keys:
|
|
1995
|
-
search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
1991
|
+
# search_keys.remove(SearchKey.IPV6_ADDRESS)
|
|
1992
|
+
search_keys.pop(SearchKey.IPV6_ADDRESS, None)
|
|
1996
1993
|
|
|
1997
|
-
keys =
|
|
1994
|
+
keys = (
|
|
1995
|
+
"{"
|
|
1996
|
+
+ ", ".join(
|
|
1997
|
+
[
|
|
1998
|
+
f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
|
|
1999
|
+
for key, name in search_keys.items()
|
|
2000
|
+
]
|
|
2001
|
+
)
|
|
2002
|
+
+ "}"
|
|
2003
|
+
)
|
|
1998
2004
|
features_for_transform = self._search_task.get_features_for_transform()
|
|
1999
2005
|
if features_for_transform:
|
|
2000
2006
|
original_features_for_transform = [
|
|
@@ -2106,7 +2112,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2106
2112
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2107
2113
|
if date_column is not None:
|
|
2108
2114
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2109
|
-
df = converter.convert(df)
|
|
2115
|
+
df = converter.convert(df, keep_time=True)
|
|
2110
2116
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2111
2117
|
generated_features.extend(converter.generated_features)
|
|
2112
2118
|
else:
|
|
@@ -2201,11 +2207,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2201
2207
|
|
|
2202
2208
|
if add_fit_system_record_id:
|
|
2203
2209
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2204
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2205
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2206
2210
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2207
2211
|
features_not_to_pass.append(SORT_ID)
|
|
2208
2212
|
|
|
2213
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2214
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2215
|
+
|
|
2209
2216
|
# search keys might be changed after explode
|
|
2210
2217
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2211
2218
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2224,7 +2231,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2224
2231
|
|
|
2225
2232
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2226
2233
|
|
|
2227
|
-
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2234
|
+
df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
|
|
2228
2235
|
|
|
2229
2236
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2230
2237
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -2339,7 +2346,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2339
2346
|
if c not in self.dropped_client_feature_names_
|
|
2340
2347
|
]
|
|
2341
2348
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2342
|
-
selecting_columns.extend(
|
|
2349
|
+
selecting_columns.extend(
|
|
2350
|
+
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2351
|
+
)
|
|
2343
2352
|
if add_fit_system_record_id:
|
|
2344
2353
|
selecting_columns.append(SORT_ID)
|
|
2345
2354
|
|
|
@@ -2794,7 +2803,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2794
2803
|
autofe_description = self.get_autofe_features_description()
|
|
2795
2804
|
if autofe_description is not None:
|
|
2796
2805
|
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
|
2797
|
-
display_html_dataframe(
|
|
2806
|
+
self.autofe_features_display_handle = display_html_dataframe(
|
|
2807
|
+
df=autofe_description,
|
|
2808
|
+
internal_df=autofe_description,
|
|
2809
|
+
header=self.bundle.get("autofe_descriptions_header"),
|
|
2810
|
+
display_id="autofe_descriptions",
|
|
2811
|
+
)
|
|
2798
2812
|
|
|
2799
2813
|
if self._has_paid_features(exclude_features_sources):
|
|
2800
2814
|
if calculate_metrics is not None and calculate_metrics:
|
|
@@ -3506,7 +3520,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3506
3520
|
|
|
3507
3521
|
return result_train, result_eval_sets
|
|
3508
3522
|
|
|
3509
|
-
def __prepare_feature_importances(
|
|
3523
|
+
def __prepare_feature_importances(
|
|
3524
|
+
self, trace_id: str, x_columns: List[str], updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
|
3525
|
+
):
|
|
3510
3526
|
if self._search_task is None:
|
|
3511
3527
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
3512
3528
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
@@ -3523,6 +3539,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3523
3539
|
features_info_without_links = []
|
|
3524
3540
|
internal_features_info = []
|
|
3525
3541
|
|
|
3542
|
+
if updated_shaps is not None:
|
|
3543
|
+
for fm in features_meta:
|
|
3544
|
+
fm.shap_value = updated_shaps.get(fm.name, 0.0)
|
|
3545
|
+
|
|
3526
3546
|
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
|
3527
3547
|
for feature_meta in features_meta:
|
|
3528
3548
|
if feature_meta.name in original_names_dict.keys():
|
|
@@ -3544,7 +3564,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3544
3564
|
):
|
|
3545
3565
|
continue
|
|
3546
3566
|
|
|
3547
|
-
|
|
3548
3567
|
self.feature_names_.append(feature_meta.name)
|
|
3549
3568
|
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3550
3569
|
|
|
@@ -3576,7 +3595,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3576
3595
|
autofe_meta = self._search_task.get_autofe_metadata()
|
|
3577
3596
|
if autofe_meta is None:
|
|
3578
3597
|
return None
|
|
3579
|
-
|
|
3598
|
+
if len(self._internal_features_info) != 0:
|
|
3599
|
+
|
|
3600
|
+
def to_feature_meta(row):
|
|
3601
|
+
fm = FeaturesMetadataV2(
|
|
3602
|
+
name=row[bundle.get("features_info_name")],
|
|
3603
|
+
type="",
|
|
3604
|
+
source="",
|
|
3605
|
+
hit_rate=row[bundle.get("features_info_hitrate")],
|
|
3606
|
+
shap_value=row[bundle.get("features_info_shap")],
|
|
3607
|
+
data_source=row[bundle.get("features_info_source")],
|
|
3608
|
+
)
|
|
3609
|
+
return fm
|
|
3610
|
+
|
|
3611
|
+
features_meta = self._internal_features_info.apply(to_feature_meta, axis=1).to_list()
|
|
3612
|
+
else:
|
|
3613
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3580
3614
|
|
|
3581
3615
|
def get_feature_by_name(name: str):
|
|
3582
3616
|
for m in features_meta:
|
|
@@ -3605,27 +3639,32 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3605
3639
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
|
3606
3640
|
continue
|
|
3607
3641
|
description["shap"] = feature_meta.shap_value
|
|
3608
|
-
description["
|
|
3609
|
-
"AutoFE:
|
|
3610
|
-
)
|
|
3611
|
-
description["
|
|
3642
|
+
description[self.bundle.get("autofe_descriptions_sources")] = feature_meta.data_source.replace(
|
|
3643
|
+
"AutoFE: features from ", ""
|
|
3644
|
+
).replace("AutoFE: feature from ", "")
|
|
3645
|
+
description[self.bundle.get("autofe_descriptions_feature_name")] = feature_meta.name
|
|
3612
3646
|
|
|
3613
3647
|
feature_idx = 1
|
|
3614
3648
|
for bc in m.base_columns:
|
|
3615
|
-
description[
|
|
3649
|
+
description[self.bundle.get("autofe_descriptions_feature").format(feature_idx)] = bc.hashed_name
|
|
3616
3650
|
feature_idx += 1
|
|
3617
3651
|
|
|
3618
|
-
description["
|
|
3652
|
+
description[self.bundle.get("autofe_descriptions_function")] = ",".join(
|
|
3653
|
+
sorted(autofe_feature.get_all_operand_names())
|
|
3654
|
+
)
|
|
3619
3655
|
|
|
3620
3656
|
descriptions.append(description)
|
|
3621
3657
|
|
|
3622
3658
|
if len(descriptions) == 0:
|
|
3623
3659
|
return None
|
|
3624
3660
|
|
|
3625
|
-
descriptions_df =
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3661
|
+
descriptions_df = (
|
|
3662
|
+
pd.DataFrame(descriptions)
|
|
3663
|
+
.fillna("")
|
|
3664
|
+
.sort_values(by="shap", ascending=False)
|
|
3665
|
+
.drop(columns="shap")
|
|
3666
|
+
.reset_index(drop=True)
|
|
3667
|
+
)
|
|
3629
3668
|
return descriptions_df
|
|
3630
3669
|
|
|
3631
3670
|
except Exception:
|
upgini/http.py
CHANGED
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
upgini/metadata.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Any, Dict, List, Optional,
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
|
|
|
228
228
|
return c
|
|
229
229
|
return None
|
|
230
230
|
|
|
231
|
-
def search_types(self) ->
|
|
232
|
-
search_keys =
|
|
231
|
+
def search_types(self) -> Dict[SearchKey, str]:
|
|
232
|
+
search_keys = dict()
|
|
233
233
|
for keys_group in self.searchKeys:
|
|
234
234
|
for key in keys_group:
|
|
235
235
|
column = self.column_by_name(key)
|
|
236
236
|
if column:
|
|
237
|
-
search_keys
|
|
237
|
+
search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
|
|
238
238
|
return search_keys
|
|
239
239
|
|
|
240
240
|
|
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -251,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
|
|
|
251
251
|
relevant_data_sources_all_shap=All features SHAP
|
|
252
252
|
relevant_data_sources_number=Number of relevant features
|
|
253
253
|
|
|
254
|
+
# Autofe descriptions
|
|
255
|
+
autofe_descriptions_header=*Description of AutoFE feature names
|
|
256
|
+
autofe_descriptions_sources=Sources
|
|
257
|
+
autofe_descriptions_feature_name=Feature name
|
|
258
|
+
autofe_descriptions_feature=Feature {}
|
|
259
|
+
autofe_descriptions_function=Function
|
|
260
|
+
|
|
261
|
+
|
|
254
262
|
# Quality metrics table
|
|
255
263
|
quality_metrics_header=Accuracy after enrichment
|
|
256
264
|
quality_metrics_train_segment=Train
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -114,10 +114,12 @@ class DateTimeSearchKeyConverter:
|
|
|
114
114
|
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
115
|
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
116
|
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
117
|
+
if sin_feature not in df.columns:
|
|
118
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
+
self.generated_features.append(sin_feature)
|
|
120
|
+
if cos_feature not in df.columns:
|
|
121
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
+
self.generated_features.append(cos_feature)
|
|
121
123
|
|
|
122
124
|
df["quarter"] = df[self.date_column].dt.quarter
|
|
123
125
|
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
|
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
39
|
for email_col in self.email_columns:
|
|
40
40
|
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
if domain_feature not in df.columns:
|
|
42
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
|
|
43
|
+
self.generated_features.append(domain_feature)
|
|
43
44
|
return df
|
|
44
45
|
|
|
45
46
|
@staticmethod
|
|
upgini/utils/features_validator.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
8
|
|
|
@@ -83,10 +84,21 @@ class FeaturesValidator:
|
|
|
83
84
|
return [
|
|
84
85
|
i
|
|
85
86
|
for i in df
|
|
86
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
87
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
|
|
87
88
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
88
89
|
]
|
|
89
90
|
|
|
91
|
+
@staticmethod
|
|
92
|
+
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
+
return (
|
|
94
|
+
is_integer_dtype(series)
|
|
95
|
+
or series.dropna()
|
|
96
|
+
.apply(
|
|
97
|
+
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
+
)
|
|
99
|
+
.all()
|
|
100
|
+
)
|
|
101
|
+
|
|
90
102
|
@staticmethod
|
|
91
103
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
92
104
|
return [i for i in df if df[i].nunique() <= 1]
|
|
{upgini-1.2.29a6.dist-info → upgini-1.2.30.dist-info}/RECORD
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=WGj1CvVJizDkAvd9BtLpwcsI-hzacJoXGbC8sVpoHYk,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=lNfu5Z40NmkkGJScKAwe_0VBtL8liePifuAlKE_flfA,192053
|
|
7
|
+
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=bKw_rjZZTomLJhQBqiM7_P2EoRq45_Ng2gP4WE6MRBE,26921
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -43,13 +43,13 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
43
43
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
44
44
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
45
45
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
46
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
+
upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
51
|
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
52
|
-
upgini/utils/features_validator.py,sha256=
|
|
52
|
+
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
54
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
55
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.30.dist-info/METADATA,sha256=93iMDL28nXF2DJaDP-oUS2CeCFHgnjk2zmAhge2LAHg,48578
|
|
63
|
+
upgini-1.2.30.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.30.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.30.dist-info/RECORD,,
|
|
File without changes
|