upgini 1.1.278a2__py3-none-any.whl → 1.1.279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -0
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +3 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +2 -1
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +2 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +6 -15
- upgini/errors.py +1 -1
- upgini/features_enricher.py +104 -217
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +5 -10
- upgini/metrics.py +102 -100
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +0 -1
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +1 -1
- upgini/utils/base_search_key_detector.py +14 -16
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +1 -1
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +20 -15
- upgini/utils/deduplicate_utils.py +1 -11
- upgini/utils/email_utils.py +2 -7
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/track_info.py +2 -2
- upgini/version_validator.py +2 -2
- {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info}/METADATA +21 -23
- upgini-1.1.279.dist-info/RECORD +62 -0
- {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info}/WHEEL +1 -2
- upgini-1.1.278a2.dist-info/RECORD +0 -62
- upgini-1.1.278a2.dist-info/top_level.txt +0 -1
- {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info/licenses}/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -11,7 +11,6 @@ import sys
|
|
|
11
11
|
import tempfile
|
|
12
12
|
import time
|
|
13
13
|
import uuid
|
|
14
|
-
from collections import Counter
|
|
15
14
|
from dataclasses import dataclass
|
|
16
15
|
from threading import Thread
|
|
17
16
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -46,11 +45,9 @@ from upgini.mdc import MDC
|
|
|
46
45
|
from upgini.metadata import (
|
|
47
46
|
COUNTRY,
|
|
48
47
|
DEFAULT_INDEX,
|
|
49
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
50
48
|
EVAL_SET_INDEX,
|
|
51
49
|
ORIGINAL_INDEX,
|
|
52
50
|
RENAMED_INDEX,
|
|
53
|
-
SEARCH_KEY_UNNEST,
|
|
54
51
|
SORT_ID,
|
|
55
52
|
SYSTEM_RECORD_ID,
|
|
56
53
|
TARGET,
|
|
@@ -251,7 +248,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
251
248
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
252
249
|
|
|
253
250
|
validate_version(self.logger)
|
|
254
|
-
self.search_keys = search_keys or
|
|
251
|
+
self.search_keys = search_keys or dict()
|
|
255
252
|
self.country_code = country_code
|
|
256
253
|
self.__validate_search_keys(search_keys, search_id)
|
|
257
254
|
self.model_task_type = model_task_type
|
|
@@ -1191,7 +1188,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1191
1188
|
email_column = self._get_email_column(search_keys)
|
|
1192
1189
|
hem_column = self._get_hem_column(search_keys)
|
|
1193
1190
|
if email_column:
|
|
1194
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys,
|
|
1191
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1195
1192
|
extended_X = converter.convert(extended_X)
|
|
1196
1193
|
generated_features.extend(converter.generated_features)
|
|
1197
1194
|
if (
|
|
@@ -1343,7 +1340,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1343
1340
|
not in (
|
|
1344
1341
|
excluding_search_keys
|
|
1345
1342
|
+ list(self.fit_dropped_features)
|
|
1346
|
-
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID
|
|
1343
|
+
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1347
1344
|
)
|
|
1348
1345
|
]
|
|
1349
1346
|
|
|
@@ -1407,7 +1404,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1407
1404
|
fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1408
1405
|
)
|
|
1409
1406
|
|
|
1410
|
-
fitting_eval_set_dict =
|
|
1407
|
+
fitting_eval_set_dict = dict()
|
|
1411
1408
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1412
1409
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1413
1410
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1519,7 +1516,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1519
1516
|
def __sample_only_input(
|
|
1520
1517
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
1521
1518
|
) -> _SampledDataForMetrics:
|
|
1522
|
-
eval_set_sampled_dict =
|
|
1519
|
+
eval_set_sampled_dict = dict()
|
|
1523
1520
|
|
|
1524
1521
|
df = validated_X.copy()
|
|
1525
1522
|
df[TARGET] = validated_y
|
|
@@ -1545,7 +1542,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1545
1542
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1546
1543
|
|
|
1547
1544
|
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1548
|
-
df_extended = self.__add_fit_system_record_id(df_extended,
|
|
1545
|
+
df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
|
|
1549
1546
|
|
|
1550
1547
|
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1551
1548
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -1569,7 +1566,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1569
1566
|
trace_id: str,
|
|
1570
1567
|
remove_outliers_calc_metrics: Optional[bool],
|
|
1571
1568
|
) -> _SampledDataForMetrics:
|
|
1572
|
-
eval_set_sampled_dict =
|
|
1569
|
+
eval_set_sampled_dict = dict()
|
|
1573
1570
|
search_keys = self.fit_search_keys
|
|
1574
1571
|
|
|
1575
1572
|
rows_to_drop = None
|
|
@@ -1643,7 +1640,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1643
1640
|
progress_bar: Optional[ProgressBar],
|
|
1644
1641
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1645
1642
|
) -> _SampledDataForMetrics:
|
|
1646
|
-
eval_set_sampled_dict =
|
|
1643
|
+
eval_set_sampled_dict = dict()
|
|
1647
1644
|
if eval_set is not None:
|
|
1648
1645
|
self.logger.info("Transform with eval_set")
|
|
1649
1646
|
# concatenate X and eval_set with eval_set_index
|
|
@@ -1665,7 +1662,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1665
1662
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1666
1663
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1667
1664
|
|
|
1668
|
-
eval_set_sampled_dict =
|
|
1665
|
+
eval_set_sampled_dict = dict()
|
|
1669
1666
|
|
|
1670
1667
|
tmp_target_name = "__target"
|
|
1671
1668
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
@@ -1928,38 +1925,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1928
1925
|
self.logger.info("Input dataset hasn't date column")
|
|
1929
1926
|
if self.add_date_if_missing:
|
|
1930
1927
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1931
|
-
|
|
1932
|
-
# Don't pass all features in backend on transform
|
|
1933
|
-
original_features_for_transform = []
|
|
1934
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1935
|
-
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1936
|
-
if len(features_not_to_pass) > 0:
|
|
1937
|
-
# Pass only features that need for transform
|
|
1938
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1939
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1940
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1941
|
-
original_features_for_transform = [
|
|
1942
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1943
|
-
]
|
|
1944
|
-
|
|
1945
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1946
|
-
|
|
1947
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1948
|
-
|
|
1949
|
-
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1950
|
-
df[columns_for_system_record_id], index=False
|
|
1951
|
-
).astype("Float64")
|
|
1952
|
-
|
|
1953
|
-
# Explode multiple search keys
|
|
1954
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1955
|
-
|
|
1956
1928
|
email_column = self._get_email_column(search_keys)
|
|
1957
1929
|
hem_column = self._get_hem_column(search_keys)
|
|
1958
1930
|
email_converted_to_hem = False
|
|
1959
1931
|
if email_column:
|
|
1960
|
-
converter = EmailSearchKeyConverter(
|
|
1961
|
-
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
1962
|
-
)
|
|
1932
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1963
1933
|
df = converter.convert(df)
|
|
1964
1934
|
generated_features.extend(converter.generated_features)
|
|
1965
1935
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1973,21 +1943,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1973
1943
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1974
1944
|
|
|
1975
1945
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1976
|
-
|
|
1977
|
-
for col in original_features_for_transform:
|
|
1978
|
-
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1979
|
-
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1946
|
+
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1980
1947
|
|
|
1981
1948
|
if email_converted_to_hem:
|
|
1982
|
-
|
|
1949
|
+
non_keys_columns.append(email_column)
|
|
1950
|
+
|
|
1951
|
+
# Don't pass features in backend on transform
|
|
1952
|
+
original_features_for_transform = None
|
|
1953
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1954
|
+
if len(non_keys_columns) > 0:
|
|
1955
|
+
# Pass only features that need for transform
|
|
1956
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
+
original_features_for_transform = [
|
|
1960
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
+
]
|
|
1962
|
+
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1983
1963
|
|
|
1984
|
-
|
|
1985
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1964
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1986
1965
|
|
|
1987
1966
|
if add_fit_system_record_id:
|
|
1988
|
-
df = self.__add_fit_system_record_id(df,
|
|
1967
|
+
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1989
1968
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1990
|
-
|
|
1969
|
+
non_keys_columns.append(SORT_ID)
|
|
1991
1970
|
|
|
1992
1971
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1993
1972
|
|
|
@@ -1995,19 +1974,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1995
1974
|
"Float64"
|
|
1996
1975
|
)
|
|
1997
1976
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1998
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
1999
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2000
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2001
1977
|
|
|
2002
1978
|
df = df.reset_index(drop=True)
|
|
2003
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID
|
|
1979
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
2004
1980
|
if add_fit_system_record_id:
|
|
2005
1981
|
system_columns_with_original_index.append(SORT_ID)
|
|
2006
1982
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
2007
1983
|
|
|
2008
1984
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2009
1985
|
|
|
2010
|
-
df_without_features = df.drop(columns=
|
|
1986
|
+
df_without_features = df.drop(columns=non_keys_columns)
|
|
2011
1987
|
|
|
2012
1988
|
df_without_features = clean_full_duplicates(
|
|
2013
1989
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2019,13 +1995,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2019
1995
|
dataset = Dataset(
|
|
2020
1996
|
"sample_" + str(uuid.uuid4()),
|
|
2021
1997
|
df=df_without_features,
|
|
2022
|
-
meaning_types=meaning_types,
|
|
2023
|
-
search_keys=combined_search_keys,
|
|
2024
|
-
unnest_search_keys=unnest_search_keys,
|
|
2025
1998
|
date_format=self.date_format,
|
|
2026
1999
|
rest_client=self.rest_client,
|
|
2027
2000
|
logger=self.logger,
|
|
2028
2001
|
)
|
|
2002
|
+
dataset.meaning_types = meaning_types
|
|
2003
|
+
dataset.search_keys = combined_search_keys
|
|
2029
2004
|
if email_converted_to_hem:
|
|
2030
2005
|
dataset.ignore_columns = [email_column]
|
|
2031
2006
|
|
|
@@ -2164,14 +2139,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2164
2139
|
|
|
2165
2140
|
key_types = search_keys.values()
|
|
2166
2141
|
|
|
2167
|
-
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2168
|
-
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2169
|
-
for multi_key in multi_keys:
|
|
2170
|
-
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2171
|
-
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2172
|
-
self.logger.warning(msg)
|
|
2173
|
-
raise ValidationError(msg)
|
|
2174
|
-
|
|
2175
2142
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2176
2143
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2177
2144
|
self.logger.warning(msg)
|
|
@@ -2187,11 +2154,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2187
2154
|
self.logger.warning(msg)
|
|
2188
2155
|
raise ValidationError(msg)
|
|
2189
2156
|
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
|
|
2157
|
+
for key_type in SearchKey.__members__.values():
|
|
2158
|
+
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2159
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2160
|
+
self.logger.warning(msg)
|
|
2161
|
+
raise ValidationError(msg)
|
|
2195
2162
|
|
|
2196
2163
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2197
2164
|
# if (
|
|
@@ -2329,6 +2296,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2329
2296
|
self.logger.info("Input dataset hasn't date column")
|
|
2330
2297
|
if self.add_date_if_missing:
|
|
2331
2298
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2299
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2300
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2301
|
+
email_converted_to_hem = False
|
|
2302
|
+
if email_column:
|
|
2303
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2304
|
+
df = converter.convert(df)
|
|
2305
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2332
2307
|
if (
|
|
2333
2308
|
self.detect_missing_search_keys
|
|
2334
2309
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2337,37 +2312,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2337
2312
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2338
2313
|
df = converter.convert(df)
|
|
2339
2314
|
|
|
2340
|
-
# Explode multiple search keys
|
|
2341
2315
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2342
|
-
meaning_types = {
|
|
2343
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2344
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2345
|
-
}
|
|
2346
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2347
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2348
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2349
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2350
|
-
|
|
2351
|
-
# TODO check that this is correct for enrichment
|
|
2352
|
-
self.df_with_original_index = df.copy()
|
|
2353
|
-
|
|
2354
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2355
|
-
|
|
2356
|
-
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2357
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2358
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2359
|
-
email_converted_to_hem = False
|
|
2360
|
-
if email_column:
|
|
2361
|
-
converter = EmailSearchKeyConverter(
|
|
2362
|
-
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2363
|
-
)
|
|
2364
|
-
df = converter.convert(df)
|
|
2365
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2366
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2367
|
-
|
|
2368
|
-
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2369
|
-
self.fit_search_keys.keys()
|
|
2370
|
-
)
|
|
2371
2316
|
if email_converted_to_hem:
|
|
2372
2317
|
non_feature_columns.append(email_column)
|
|
2373
2318
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2391,14 +2336,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2391
2336
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2392
2337
|
}
|
|
2393
2338
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2394
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2395
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2396
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2397
2339
|
if eval_set is not None and len(eval_set) > 0:
|
|
2398
2340
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2399
2341
|
|
|
2400
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys
|
|
2342
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2401
2343
|
|
|
2344
|
+
self.df_with_original_index = df.copy()
|
|
2402
2345
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2403
2346
|
|
|
2404
2347
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2406,15 +2349,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2406
2349
|
dataset = Dataset(
|
|
2407
2350
|
"tds_" + str(uuid.uuid4()),
|
|
2408
2351
|
df=df,
|
|
2409
|
-
meaning_types=meaning_types,
|
|
2410
|
-
search_keys=combined_search_keys,
|
|
2411
|
-
unnest_search_keys=unnest_search_keys,
|
|
2412
2352
|
model_task_type=model_task_type,
|
|
2413
2353
|
date_format=self.date_format,
|
|
2414
2354
|
random_state=self.random_state,
|
|
2415
2355
|
rest_client=self.rest_client,
|
|
2416
2356
|
logger=self.logger,
|
|
2417
2357
|
)
|
|
2358
|
+
dataset.meaning_types = meaning_types
|
|
2359
|
+
dataset.search_keys = combined_search_keys
|
|
2418
2360
|
if email_converted_to_hem:
|
|
2419
2361
|
dataset.ignore_columns = [email_column]
|
|
2420
2362
|
|
|
@@ -2606,7 +2548,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2606
2548
|
validated_X = X.copy()
|
|
2607
2549
|
elif isinstance(X, pd.Series):
|
|
2608
2550
|
validated_X = X.to_frame()
|
|
2609
|
-
elif isinstance(X, np.ndarray)
|
|
2551
|
+
elif isinstance(X, (list, np.ndarray)):
|
|
2610
2552
|
validated_X = pd.DataFrame(X)
|
|
2611
2553
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2612
2554
|
validated_X = validated_X.rename(columns=renaming)
|
|
@@ -2695,7 +2637,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2695
2637
|
validated_eval_X = eval_X.copy()
|
|
2696
2638
|
elif isinstance(eval_X, pd.Series):
|
|
2697
2639
|
validated_eval_X = eval_X.to_frame()
|
|
2698
|
-
elif isinstance(eval_X, np.ndarray)
|
|
2640
|
+
elif isinstance(eval_X, (list, np.ndarray)):
|
|
2699
2641
|
validated_eval_X = pd.DataFrame(eval_X)
|
|
2700
2642
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2701
2643
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
@@ -2784,10 +2726,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2784
2726
|
X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
|
|
2785
2727
|
) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2786
2728
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2787
|
-
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
|
2788
2729
|
Xy = X.copy()
|
|
2789
2730
|
Xy[TARGET] = y
|
|
2790
|
-
Xy = Xy.sort_values(by=
|
|
2731
|
+
Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2791
2732
|
X = Xy.drop(columns=TARGET)
|
|
2792
2733
|
y = Xy[TARGET].copy()
|
|
2793
2734
|
|
|
@@ -2878,7 +2819,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2878
2819
|
)
|
|
2879
2820
|
|
|
2880
2821
|
def sample(df):
|
|
2881
|
-
if isinstance(df, pd.
|
|
2822
|
+
if isinstance(df, (pd.DataFrame, pd.Series)):
|
|
2882
2823
|
return df.head(10)
|
|
2883
2824
|
else:
|
|
2884
2825
|
return df[:10]
|
|
@@ -2964,19 +2905,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2964
2905
|
|
|
2965
2906
|
@staticmethod
|
|
2966
2907
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
if len(cols) == 1:
|
|
2971
|
-
return cols[0]
|
|
2908
|
+
for col, t in search_keys.items():
|
|
2909
|
+
if t == SearchKey.EMAIL:
|
|
2910
|
+
return col
|
|
2972
2911
|
|
|
2973
2912
|
@staticmethod
|
|
2974
2913
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
if len(cols) == 1:
|
|
2979
|
-
return cols[0]
|
|
2914
|
+
for col, t in search_keys.items():
|
|
2915
|
+
if t == SearchKey.HEM:
|
|
2916
|
+
return col
|
|
2980
2917
|
|
|
2981
2918
|
@staticmethod
|
|
2982
2919
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2984,44 +2921,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2984
2921
|
if t == SearchKey.PHONE:
|
|
2985
2922
|
return col
|
|
2986
2923
|
|
|
2987
|
-
def _explode_multiple_search_keys(
|
|
2988
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
2989
|
-
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2990
|
-
# find groups of multiple search keys
|
|
2991
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
2992
|
-
for key_name, key_type in search_keys.items():
|
|
2993
|
-
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
2994
|
-
search_key_names_by_type = {
|
|
2995
|
-
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
2996
|
-
}
|
|
2997
|
-
if len(search_key_names_by_type) == 0:
|
|
2998
|
-
return df, {}
|
|
2999
|
-
|
|
3000
|
-
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3001
|
-
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3002
|
-
exploded_dfs = []
|
|
3003
|
-
unnest_search_keys = {}
|
|
3004
|
-
|
|
3005
|
-
for key_type, key_names in search_key_names_by_type.items():
|
|
3006
|
-
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
3007
|
-
exploded_df = pd.melt(
|
|
3008
|
-
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
3009
|
-
)
|
|
3010
|
-
exploded_dfs.append(exploded_df)
|
|
3011
|
-
for old_key in key_names:
|
|
3012
|
-
del search_keys[old_key]
|
|
3013
|
-
search_keys[new_search_key] = key_type
|
|
3014
|
-
unnest_search_keys[new_search_key] = key_names
|
|
3015
|
-
|
|
3016
|
-
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3017
|
-
return df, unnest_search_keys
|
|
3018
|
-
|
|
3019
2924
|
def __add_fit_system_record_id(
|
|
3020
|
-
self,
|
|
3021
|
-
df: pd.DataFrame,
|
|
3022
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3023
|
-
search_keys: Dict[str, SearchKey],
|
|
3024
|
-
id_name: str,
|
|
2925
|
+
self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
|
|
3025
2926
|
) -> pd.DataFrame:
|
|
3026
2927
|
# save original order or rows
|
|
3027
2928
|
original_index_name = df.index.name
|
|
@@ -3070,18 +2971,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3070
2971
|
|
|
3071
2972
|
df = df.reset_index(drop=True).reset_index()
|
|
3072
2973
|
# system_record_id saves correct order for fit
|
|
3073
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
2974
|
+
df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
|
|
3074
2975
|
|
|
3075
2976
|
# return original order
|
|
3076
2977
|
df = df.set_index(ORIGINAL_INDEX)
|
|
3077
2978
|
df.index.name = original_index_name
|
|
3078
2979
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3079
2980
|
|
|
3080
|
-
meaning_types[
|
|
3081
|
-
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3082
|
-
if id_name == SYSTEM_RECORD_ID
|
|
3083
|
-
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3084
|
-
)
|
|
2981
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3085
2982
|
return df
|
|
3086
2983
|
|
|
3087
2984
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3136,11 +3033,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3136
3033
|
)
|
|
3137
3034
|
|
|
3138
3035
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3139
|
-
dup_features = [
|
|
3140
|
-
c
|
|
3141
|
-
for c in comparing_columns
|
|
3142
|
-
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3143
|
-
]
|
|
3036
|
+
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
3144
3037
|
if len(dup_features) > 0:
|
|
3145
3038
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3146
3039
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3151,7 +3044,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3151
3044
|
result_features = pd.merge(
|
|
3152
3045
|
df_with_original_index,
|
|
3153
3046
|
result_features,
|
|
3154
|
-
|
|
3047
|
+
left_on=SYSTEM_RECORD_ID,
|
|
3048
|
+
right_on=SYSTEM_RECORD_ID,
|
|
3155
3049
|
how="left" if is_transform else "inner",
|
|
3156
3050
|
)
|
|
3157
3051
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3162,7 +3056,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3162
3056
|
result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
|
|
3163
3057
|
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3164
3058
|
|
|
3165
|
-
result_eval_sets =
|
|
3059
|
+
result_eval_sets = dict()
|
|
3166
3060
|
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3167
3061
|
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3168
3062
|
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
@@ -3368,7 +3262,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3368
3262
|
if autofe_feature.op.is_vector:
|
|
3369
3263
|
continue
|
|
3370
3264
|
|
|
3371
|
-
description =
|
|
3265
|
+
description = dict()
|
|
3372
3266
|
|
|
3373
3267
|
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3374
3268
|
if feature_meta is None:
|
|
@@ -3534,13 +3428,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3534
3428
|
self.warning_counter.increment()
|
|
3535
3429
|
|
|
3536
3430
|
if len(valid_search_keys) == 1:
|
|
3537
|
-
|
|
3538
|
-
|
|
3539
|
-
|
|
3540
|
-
|
|
3541
|
-
|
|
3542
|
-
|
|
3543
|
-
|
|
3431
|
+
for k, v in valid_search_keys.items():
|
|
3432
|
+
# Show warning for country only if country is the only key
|
|
3433
|
+
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3434
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3435
|
+
print(msg)
|
|
3436
|
+
self.logger.warning(msg)
|
|
3437
|
+
self.warning_counter.increment()
|
|
3544
3438
|
|
|
3545
3439
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3546
3440
|
|
|
@@ -3650,68 +3544,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3650
3544
|
def check_need_detect(search_key: SearchKey):
|
|
3651
3545
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3652
3546
|
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
self.autodetected_search_keys.update(new_keys)
|
|
3660
|
-
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3547
|
+
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3548
|
+
maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
|
|
3549
|
+
if maybe_key is not None:
|
|
3550
|
+
search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3551
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3552
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3661
3553
|
if not silent_mode:
|
|
3662
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3554
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3663
3555
|
|
|
3664
3556
|
if (
|
|
3665
3557
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3666
3558
|
and self.country_code is None
|
|
3667
3559
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3668
3560
|
):
|
|
3669
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3670
|
-
if maybe_key:
|
|
3671
|
-
search_keys[maybe_key
|
|
3672
|
-
self.autodetected_search_keys[maybe_key
|
|
3561
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
|
|
3562
|
+
if maybe_key is not None:
|
|
3563
|
+
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3564
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3673
3565
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3674
3566
|
if not silent_mode:
|
|
3675
3567
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3676
3568
|
|
|
3677
3569
|
if (
|
|
3678
|
-
|
|
3679
|
-
SearchKey.HEM not in search_keys.values()
|
|
3570
|
+
SearchKey.EMAIL not in search_keys.values()
|
|
3571
|
+
and SearchKey.HEM not in search_keys.values()
|
|
3680
3572
|
and check_need_detect(SearchKey.HEM)
|
|
3681
3573
|
):
|
|
3682
|
-
|
|
3683
|
-
if
|
|
3574
|
+
maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
|
|
3575
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3684
3576
|
if self.__is_registered or is_demo_dataset:
|
|
3685
|
-
|
|
3686
|
-
|
|
3687
|
-
self.
|
|
3688
|
-
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3577
|
+
search_keys[maybe_key] = SearchKey.EMAIL
|
|
3578
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3579
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3689
3580
|
if not silent_mode:
|
|
3690
|
-
print(self.bundle.get("email_detected").format(
|
|
3581
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3691
3582
|
else:
|
|
3692
3583
|
self.logger.warning(
|
|
3693
|
-
f"Autodetected search key EMAIL in column {
|
|
3694
|
-
" But not used because not registered user"
|
|
3584
|
+
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3695
3585
|
)
|
|
3696
3586
|
if not silent_mode:
|
|
3697
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3587
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3698
3588
|
self.warning_counter.increment()
|
|
3699
3589
|
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
|
|
3703
|
-
if maybe_keys:
|
|
3590
|
+
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3591
|
+
maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
|
|
3592
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3704
3593
|
if self.__is_registered or is_demo_dataset:
|
|
3705
|
-
|
|
3706
|
-
|
|
3707
|
-
self.
|
|
3708
|
-
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3594
|
+
search_keys[maybe_key] = SearchKey.PHONE
|
|
3595
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3596
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3709
3597
|
if not silent_mode:
|
|
3710
|
-
print(self.bundle.get("phone_detected").format(
|
|
3598
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3711
3599
|
else:
|
|
3712
3600
|
self.logger.warning(
|
|
3713
|
-
f"Autodetected search key PHONE in column {
|
|
3714
|
-
"But not used because not registered user"
|
|
3601
|
+
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3715
3602
|
)
|
|
3716
3603
|
if not silent_mode:
|
|
3717
3604
|
print(self.bundle.get("phone_detected_not_registered"))
|
|
@@ -3806,7 +3693,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3806
3693
|
def sample(inp, sample_index):
|
|
3807
3694
|
if _num_samples(inp) <= 1000:
|
|
3808
3695
|
return inp
|
|
3809
|
-
if isinstance(inp, pd.DataFrame
|
|
3696
|
+
if isinstance(inp, (pd.DataFrame, pd.Series)):
|
|
3810
3697
|
return inp.sample(n=1000, random_state=random_state)
|
|
3811
3698
|
if isinstance(inp, np.ndarray):
|
|
3812
3699
|
return inp[sample_index]
|