upgini 1.1.278a1__py3-none-any.whl → 1.1.279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -0
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +3 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +2 -1
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +2 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +6 -15
- upgini/errors.py +1 -1
- upgini/features_enricher.py +102 -214
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +5 -10
- upgini/metrics.py +102 -100
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +0 -1
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +1 -1
- upgini/utils/base_search_key_detector.py +14 -16
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +1 -1
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +20 -15
- upgini/utils/deduplicate_utils.py +1 -11
- upgini/utils/email_utils.py +2 -7
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/track_info.py +2 -2
- upgini/version_validator.py +2 -2
- {upgini-1.1.278a1.dist-info → upgini-1.1.279.dist-info}/METADATA +21 -23
- upgini-1.1.279.dist-info/RECORD +62 -0
- {upgini-1.1.278a1.dist-info → upgini-1.1.279.dist-info}/WHEEL +1 -2
- upgini/fingerprint.js +0 -8
- upgini-1.1.278a1.dist-info/RECORD +0 -63
- upgini-1.1.278a1.dist-info/top_level.txt +0 -1
- {upgini-1.1.278a1.dist-info → upgini-1.1.279.dist-info/licenses}/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -11,7 +11,6 @@ import sys
|
|
|
11
11
|
import tempfile
|
|
12
12
|
import time
|
|
13
13
|
import uuid
|
|
14
|
-
from collections import Counter
|
|
15
14
|
from dataclasses import dataclass
|
|
16
15
|
from threading import Thread
|
|
17
16
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -46,11 +45,9 @@ from upgini.mdc import MDC
|
|
|
46
45
|
from upgini.metadata import (
|
|
47
46
|
COUNTRY,
|
|
48
47
|
DEFAULT_INDEX,
|
|
49
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
50
48
|
EVAL_SET_INDEX,
|
|
51
49
|
ORIGINAL_INDEX,
|
|
52
50
|
RENAMED_INDEX,
|
|
53
|
-
SEARCH_KEY_UNNEST,
|
|
54
51
|
SORT_ID,
|
|
55
52
|
SYSTEM_RECORD_ID,
|
|
56
53
|
TARGET,
|
|
@@ -251,7 +248,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
251
248
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
252
249
|
|
|
253
250
|
validate_version(self.logger)
|
|
254
|
-
self.search_keys = search_keys or
|
|
251
|
+
self.search_keys = search_keys or dict()
|
|
255
252
|
self.country_code = country_code
|
|
256
253
|
self.__validate_search_keys(search_keys, search_id)
|
|
257
254
|
self.model_task_type = model_task_type
|
|
@@ -1191,7 +1188,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1191
1188
|
email_column = self._get_email_column(search_keys)
|
|
1192
1189
|
hem_column = self._get_hem_column(search_keys)
|
|
1193
1190
|
if email_column:
|
|
1194
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys,
|
|
1191
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1195
1192
|
extended_X = converter.convert(extended_X)
|
|
1196
1193
|
generated_features.extend(converter.generated_features)
|
|
1197
1194
|
if (
|
|
@@ -1407,7 +1404,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1407
1404
|
fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1408
1405
|
)
|
|
1409
1406
|
|
|
1410
|
-
fitting_eval_set_dict =
|
|
1407
|
+
fitting_eval_set_dict = dict()
|
|
1411
1408
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1412
1409
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1413
1410
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1519,7 +1516,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1519
1516
|
def __sample_only_input(
|
|
1520
1517
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
1521
1518
|
) -> _SampledDataForMetrics:
|
|
1522
|
-
eval_set_sampled_dict =
|
|
1519
|
+
eval_set_sampled_dict = dict()
|
|
1523
1520
|
|
|
1524
1521
|
df = validated_X.copy()
|
|
1525
1522
|
df[TARGET] = validated_y
|
|
@@ -1545,7 +1542,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1545
1542
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1546
1543
|
|
|
1547
1544
|
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1548
|
-
df_extended = self.__add_fit_system_record_id(df_extended,
|
|
1545
|
+
df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
|
|
1549
1546
|
|
|
1550
1547
|
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1551
1548
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -1569,7 +1566,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1569
1566
|
trace_id: str,
|
|
1570
1567
|
remove_outliers_calc_metrics: Optional[bool],
|
|
1571
1568
|
) -> _SampledDataForMetrics:
|
|
1572
|
-
eval_set_sampled_dict =
|
|
1569
|
+
eval_set_sampled_dict = dict()
|
|
1573
1570
|
search_keys = self.fit_search_keys
|
|
1574
1571
|
|
|
1575
1572
|
rows_to_drop = None
|
|
@@ -1643,7 +1640,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1643
1640
|
progress_bar: Optional[ProgressBar],
|
|
1644
1641
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1645
1642
|
) -> _SampledDataForMetrics:
|
|
1646
|
-
eval_set_sampled_dict =
|
|
1643
|
+
eval_set_sampled_dict = dict()
|
|
1647
1644
|
if eval_set is not None:
|
|
1648
1645
|
self.logger.info("Transform with eval_set")
|
|
1649
1646
|
# concatenate X and eval_set with eval_set_index
|
|
@@ -1665,7 +1662,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1665
1662
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1666
1663
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1667
1664
|
|
|
1668
|
-
eval_set_sampled_dict =
|
|
1665
|
+
eval_set_sampled_dict = dict()
|
|
1669
1666
|
|
|
1670
1667
|
tmp_target_name = "__target"
|
|
1671
1668
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
@@ -1928,38 +1925,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1928
1925
|
self.logger.info("Input dataset hasn't date column")
|
|
1929
1926
|
if self.add_date_if_missing:
|
|
1930
1927
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1931
|
-
|
|
1932
|
-
# Don't pass all features in backend on transform
|
|
1933
|
-
original_features_for_transform = []
|
|
1934
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1935
|
-
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1936
|
-
if len(features_not_to_pass) > 0:
|
|
1937
|
-
# Pass only features that need for transform
|
|
1938
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1939
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1940
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1941
|
-
original_features_for_transform = [
|
|
1942
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1943
|
-
]
|
|
1944
|
-
|
|
1945
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1946
|
-
|
|
1947
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1948
|
-
|
|
1949
|
-
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1950
|
-
df[columns_for_system_record_id], index=False
|
|
1951
|
-
).astype("Float64")
|
|
1952
|
-
|
|
1953
|
-
# Explode multiple search keys
|
|
1954
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1955
|
-
|
|
1956
1928
|
email_column = self._get_email_column(search_keys)
|
|
1957
1929
|
hem_column = self._get_hem_column(search_keys)
|
|
1958
1930
|
email_converted_to_hem = False
|
|
1959
1931
|
if email_column:
|
|
1960
|
-
converter = EmailSearchKeyConverter(
|
|
1961
|
-
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
1962
|
-
)
|
|
1932
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1963
1933
|
df = converter.convert(df)
|
|
1964
1934
|
generated_features.extend(converter.generated_features)
|
|
1965
1935
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1973,21 +1943,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1973
1943
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1974
1944
|
|
|
1975
1945
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1976
|
-
|
|
1977
|
-
for col in original_features_for_transform:
|
|
1978
|
-
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1979
|
-
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1946
|
+
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1980
1947
|
|
|
1981
1948
|
if email_converted_to_hem:
|
|
1982
|
-
|
|
1949
|
+
non_keys_columns.append(email_column)
|
|
1950
|
+
|
|
1951
|
+
# Don't pass features in backend on transform
|
|
1952
|
+
original_features_for_transform = None
|
|
1953
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1954
|
+
if len(non_keys_columns) > 0:
|
|
1955
|
+
# Pass only features that need for transform
|
|
1956
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
+
original_features_for_transform = [
|
|
1960
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
+
]
|
|
1962
|
+
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1983
1963
|
|
|
1984
|
-
|
|
1985
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1964
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1986
1965
|
|
|
1987
1966
|
if add_fit_system_record_id:
|
|
1988
|
-
df = self.__add_fit_system_record_id(df,
|
|
1967
|
+
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1989
1968
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1990
|
-
|
|
1969
|
+
non_keys_columns.append(SORT_ID)
|
|
1991
1970
|
|
|
1992
1971
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1993
1972
|
|
|
@@ -1995,19 +1974,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1995
1974
|
"Float64"
|
|
1996
1975
|
)
|
|
1997
1976
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1998
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
1999
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2000
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2001
1977
|
|
|
2002
1978
|
df = df.reset_index(drop=True)
|
|
2003
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID
|
|
1979
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
2004
1980
|
if add_fit_system_record_id:
|
|
2005
1981
|
system_columns_with_original_index.append(SORT_ID)
|
|
2006
1982
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
2007
1983
|
|
|
2008
1984
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2009
1985
|
|
|
2010
|
-
df_without_features = df.drop(columns=
|
|
1986
|
+
df_without_features = df.drop(columns=non_keys_columns)
|
|
2011
1987
|
|
|
2012
1988
|
df_without_features = clean_full_duplicates(
|
|
2013
1989
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2019,13 +1995,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2019
1995
|
dataset = Dataset(
|
|
2020
1996
|
"sample_" + str(uuid.uuid4()),
|
|
2021
1997
|
df=df_without_features,
|
|
2022
|
-
meaning_types=meaning_types,
|
|
2023
|
-
search_keys=combined_search_keys,
|
|
2024
|
-
unnest_search_keys=unnest_search_keys,
|
|
2025
1998
|
date_format=self.date_format,
|
|
2026
1999
|
rest_client=self.rest_client,
|
|
2027
2000
|
logger=self.logger,
|
|
2028
2001
|
)
|
|
2002
|
+
dataset.meaning_types = meaning_types
|
|
2003
|
+
dataset.search_keys = combined_search_keys
|
|
2029
2004
|
if email_converted_to_hem:
|
|
2030
2005
|
dataset.ignore_columns = [email_column]
|
|
2031
2006
|
|
|
@@ -2164,14 +2139,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2164
2139
|
|
|
2165
2140
|
key_types = search_keys.values()
|
|
2166
2141
|
|
|
2167
|
-
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2168
|
-
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2169
|
-
for multi_key in multi_keys:
|
|
2170
|
-
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2171
|
-
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2172
|
-
self.logger.warning(msg)
|
|
2173
|
-
raise ValidationError(msg)
|
|
2174
|
-
|
|
2175
2142
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2176
2143
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2177
2144
|
self.logger.warning(msg)
|
|
@@ -2187,11 +2154,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2187
2154
|
self.logger.warning(msg)
|
|
2188
2155
|
raise ValidationError(msg)
|
|
2189
2156
|
|
|
2190
|
-
|
|
2191
|
-
|
|
2192
|
-
|
|
2193
|
-
|
|
2194
|
-
|
|
2157
|
+
for key_type in SearchKey.__members__.values():
|
|
2158
|
+
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2159
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2160
|
+
self.logger.warning(msg)
|
|
2161
|
+
raise ValidationError(msg)
|
|
2195
2162
|
|
|
2196
2163
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2197
2164
|
# if (
|
|
@@ -2329,6 +2296,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2329
2296
|
self.logger.info("Input dataset hasn't date column")
|
|
2330
2297
|
if self.add_date_if_missing:
|
|
2331
2298
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2299
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2300
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2301
|
+
email_converted_to_hem = False
|
|
2302
|
+
if email_column:
|
|
2303
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2304
|
+
df = converter.convert(df)
|
|
2305
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2332
2307
|
if (
|
|
2333
2308
|
self.detect_missing_search_keys
|
|
2334
2309
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2337,37 +2312,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2337
2312
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2338
2313
|
df = converter.convert(df)
|
|
2339
2314
|
|
|
2340
|
-
# Explode multiple search keys
|
|
2341
2315
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2342
|
-
meaning_types = {
|
|
2343
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2344
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2345
|
-
}
|
|
2346
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2347
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2348
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2349
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2350
|
-
|
|
2351
|
-
# TODO check that this is correct for enrichment
|
|
2352
|
-
self.df_with_original_index = df.copy()
|
|
2353
|
-
|
|
2354
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2355
|
-
|
|
2356
|
-
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2357
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2358
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2359
|
-
email_converted_to_hem = False
|
|
2360
|
-
if email_column:
|
|
2361
|
-
converter = EmailSearchKeyConverter(
|
|
2362
|
-
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2363
|
-
)
|
|
2364
|
-
df = converter.convert(df)
|
|
2365
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2366
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2367
|
-
|
|
2368
|
-
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2369
|
-
self.fit_search_keys.keys()
|
|
2370
|
-
)
|
|
2371
2316
|
if email_converted_to_hem:
|
|
2372
2317
|
non_feature_columns.append(email_column)
|
|
2373
2318
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2391,14 +2336,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2391
2336
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2392
2337
|
}
|
|
2393
2338
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2394
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2395
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2396
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2397
2339
|
if eval_set is not None and len(eval_set) > 0:
|
|
2398
2340
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2399
2341
|
|
|
2400
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys
|
|
2342
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2401
2343
|
|
|
2344
|
+
self.df_with_original_index = df.copy()
|
|
2402
2345
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2403
2346
|
|
|
2404
2347
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2406,15 +2349,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2406
2349
|
dataset = Dataset(
|
|
2407
2350
|
"tds_" + str(uuid.uuid4()),
|
|
2408
2351
|
df=df,
|
|
2409
|
-
meaning_types=meaning_types,
|
|
2410
|
-
search_keys=combined_search_keys,
|
|
2411
|
-
unnest_search_keys=unnest_search_keys,
|
|
2412
2352
|
model_task_type=model_task_type,
|
|
2413
2353
|
date_format=self.date_format,
|
|
2414
2354
|
random_state=self.random_state,
|
|
2415
2355
|
rest_client=self.rest_client,
|
|
2416
2356
|
logger=self.logger,
|
|
2417
2357
|
)
|
|
2358
|
+
dataset.meaning_types = meaning_types
|
|
2359
|
+
dataset.search_keys = combined_search_keys
|
|
2418
2360
|
if email_converted_to_hem:
|
|
2419
2361
|
dataset.ignore_columns = [email_column]
|
|
2420
2362
|
|
|
@@ -2606,7 +2548,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2606
2548
|
validated_X = X.copy()
|
|
2607
2549
|
elif isinstance(X, pd.Series):
|
|
2608
2550
|
validated_X = X.to_frame()
|
|
2609
|
-
elif isinstance(X, np.ndarray)
|
|
2551
|
+
elif isinstance(X, (list, np.ndarray)):
|
|
2610
2552
|
validated_X = pd.DataFrame(X)
|
|
2611
2553
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2612
2554
|
validated_X = validated_X.rename(columns=renaming)
|
|
@@ -2695,7 +2637,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2695
2637
|
validated_eval_X = eval_X.copy()
|
|
2696
2638
|
elif isinstance(eval_X, pd.Series):
|
|
2697
2639
|
validated_eval_X = eval_X.to_frame()
|
|
2698
|
-
elif isinstance(eval_X, np.ndarray)
|
|
2640
|
+
elif isinstance(eval_X, (list, np.ndarray)):
|
|
2699
2641
|
validated_eval_X = pd.DataFrame(eval_X)
|
|
2700
2642
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2701
2643
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
@@ -2877,7 +2819,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2877
2819
|
)
|
|
2878
2820
|
|
|
2879
2821
|
def sample(df):
|
|
2880
|
-
if isinstance(df, pd.
|
|
2822
|
+
if isinstance(df, (pd.DataFrame, pd.Series)):
|
|
2881
2823
|
return df.head(10)
|
|
2882
2824
|
else:
|
|
2883
2825
|
return df[:10]
|
|
@@ -2963,19 +2905,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2963
2905
|
|
|
2964
2906
|
@staticmethod
|
|
2965
2907
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
if len(cols) == 1:
|
|
2970
|
-
return cols[0]
|
|
2908
|
+
for col, t in search_keys.items():
|
|
2909
|
+
if t == SearchKey.EMAIL:
|
|
2910
|
+
return col
|
|
2971
2911
|
|
|
2972
2912
|
@staticmethod
|
|
2973
2913
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
if len(cols) == 1:
|
|
2978
|
-
return cols[0]
|
|
2914
|
+
for col, t in search_keys.items():
|
|
2915
|
+
if t == SearchKey.HEM:
|
|
2916
|
+
return col
|
|
2979
2917
|
|
|
2980
2918
|
@staticmethod
|
|
2981
2919
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2983,44 +2921,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2983
2921
|
if t == SearchKey.PHONE:
|
|
2984
2922
|
return col
|
|
2985
2923
|
|
|
2986
|
-
def _explode_multiple_search_keys(
|
|
2987
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
2988
|
-
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2989
|
-
# find groups of multiple search keys
|
|
2990
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
2991
|
-
for key_name, key_type in search_keys.items():
|
|
2992
|
-
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
2993
|
-
search_key_names_by_type = {
|
|
2994
|
-
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
2995
|
-
}
|
|
2996
|
-
if len(search_key_names_by_type) == 0:
|
|
2997
|
-
return df, {}
|
|
2998
|
-
|
|
2999
|
-
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3000
|
-
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3001
|
-
exploded_dfs = []
|
|
3002
|
-
unnest_search_keys = {}
|
|
3003
|
-
|
|
3004
|
-
for key_type, key_names in search_key_names_by_type.items():
|
|
3005
|
-
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
3006
|
-
exploded_df = pd.melt(
|
|
3007
|
-
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
3008
|
-
)
|
|
3009
|
-
exploded_dfs.append(exploded_df)
|
|
3010
|
-
for old_key in key_names:
|
|
3011
|
-
del search_keys[old_key]
|
|
3012
|
-
search_keys[new_search_key] = key_type
|
|
3013
|
-
unnest_search_keys[new_search_key] = key_names
|
|
3014
|
-
|
|
3015
|
-
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3016
|
-
return df, unnest_search_keys
|
|
3017
|
-
|
|
3018
2924
|
def __add_fit_system_record_id(
|
|
3019
|
-
self,
|
|
3020
|
-
df: pd.DataFrame,
|
|
3021
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3022
|
-
search_keys: Dict[str, SearchKey],
|
|
3023
|
-
id_name: str,
|
|
2925
|
+
self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
|
|
3024
2926
|
) -> pd.DataFrame:
|
|
3025
2927
|
# save original order or rows
|
|
3026
2928
|
original_index_name = df.index.name
|
|
@@ -3069,18 +2971,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3069
2971
|
|
|
3070
2972
|
df = df.reset_index(drop=True).reset_index()
|
|
3071
2973
|
# system_record_id saves correct order for fit
|
|
3072
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
2974
|
+
df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
|
|
3073
2975
|
|
|
3074
2976
|
# return original order
|
|
3075
2977
|
df = df.set_index(ORIGINAL_INDEX)
|
|
3076
2978
|
df.index.name = original_index_name
|
|
3077
2979
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3078
2980
|
|
|
3079
|
-
meaning_types[
|
|
3080
|
-
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3081
|
-
if id_name == SYSTEM_RECORD_ID
|
|
3082
|
-
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3083
|
-
)
|
|
2981
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3084
2982
|
return df
|
|
3085
2983
|
|
|
3086
2984
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3135,11 +3033,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3135
3033
|
)
|
|
3136
3034
|
|
|
3137
3035
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3138
|
-
dup_features = [
|
|
3139
|
-
c
|
|
3140
|
-
for c in comparing_columns
|
|
3141
|
-
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3142
|
-
]
|
|
3036
|
+
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
3143
3037
|
if len(dup_features) > 0:
|
|
3144
3038
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3145
3039
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3150,7 +3044,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3150
3044
|
result_features = pd.merge(
|
|
3151
3045
|
df_with_original_index,
|
|
3152
3046
|
result_features,
|
|
3153
|
-
|
|
3047
|
+
left_on=SYSTEM_RECORD_ID,
|
|
3048
|
+
right_on=SYSTEM_RECORD_ID,
|
|
3154
3049
|
how="left" if is_transform else "inner",
|
|
3155
3050
|
)
|
|
3156
3051
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3161,7 +3056,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3161
3056
|
result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
|
|
3162
3057
|
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3163
3058
|
|
|
3164
|
-
result_eval_sets =
|
|
3059
|
+
result_eval_sets = dict()
|
|
3165
3060
|
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3166
3061
|
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3167
3062
|
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
@@ -3367,7 +3262,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3367
3262
|
if autofe_feature.op.is_vector:
|
|
3368
3263
|
continue
|
|
3369
3264
|
|
|
3370
|
-
description =
|
|
3265
|
+
description = dict()
|
|
3371
3266
|
|
|
3372
3267
|
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3373
3268
|
if feature_meta is None:
|
|
@@ -3533,13 +3428,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3533
3428
|
self.warning_counter.increment()
|
|
3534
3429
|
|
|
3535
3430
|
if len(valid_search_keys) == 1:
|
|
3536
|
-
|
|
3537
|
-
|
|
3538
|
-
|
|
3539
|
-
|
|
3540
|
-
|
|
3541
|
-
|
|
3542
|
-
|
|
3431
|
+
for k, v in valid_search_keys.items():
|
|
3432
|
+
# Show warning for country only if country is the only key
|
|
3433
|
+
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3434
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3435
|
+
print(msg)
|
|
3436
|
+
self.logger.warning(msg)
|
|
3437
|
+
self.warning_counter.increment()
|
|
3543
3438
|
|
|
3544
3439
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3545
3440
|
|
|
@@ -3649,68 +3544,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3649
3544
|
def check_need_detect(search_key: SearchKey):
|
|
3650
3545
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3651
3546
|
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
self.autodetected_search_keys.update(new_keys)
|
|
3659
|
-
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3547
|
+
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3548
|
+
maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
|
|
3549
|
+
if maybe_key is not None:
|
|
3550
|
+
search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3551
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3552
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3660
3553
|
if not silent_mode:
|
|
3661
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3554
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3662
3555
|
|
|
3663
3556
|
if (
|
|
3664
3557
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3665
3558
|
and self.country_code is None
|
|
3666
3559
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3667
3560
|
):
|
|
3668
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3669
|
-
if maybe_key:
|
|
3670
|
-
search_keys[maybe_key
|
|
3671
|
-
self.autodetected_search_keys[maybe_key
|
|
3561
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
|
|
3562
|
+
if maybe_key is not None:
|
|
3563
|
+
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3564
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3672
3565
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3673
3566
|
if not silent_mode:
|
|
3674
3567
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3675
3568
|
|
|
3676
3569
|
if (
|
|
3677
|
-
|
|
3678
|
-
SearchKey.HEM not in search_keys.values()
|
|
3570
|
+
SearchKey.EMAIL not in search_keys.values()
|
|
3571
|
+
and SearchKey.HEM not in search_keys.values()
|
|
3679
3572
|
and check_need_detect(SearchKey.HEM)
|
|
3680
3573
|
):
|
|
3681
|
-
|
|
3682
|
-
if
|
|
3574
|
+
maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
|
|
3575
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3683
3576
|
if self.__is_registered or is_demo_dataset:
|
|
3684
|
-
|
|
3685
|
-
|
|
3686
|
-
self.
|
|
3687
|
-
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3577
|
+
search_keys[maybe_key] = SearchKey.EMAIL
|
|
3578
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3579
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3688
3580
|
if not silent_mode:
|
|
3689
|
-
print(self.bundle.get("email_detected").format(
|
|
3581
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3690
3582
|
else:
|
|
3691
3583
|
self.logger.warning(
|
|
3692
|
-
f"Autodetected search key EMAIL in column {
|
|
3693
|
-
" But not used because not registered user"
|
|
3584
|
+
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3694
3585
|
)
|
|
3695
3586
|
if not silent_mode:
|
|
3696
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3587
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3697
3588
|
self.warning_counter.increment()
|
|
3698
3589
|
|
|
3699
|
-
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
if maybe_keys:
|
|
3590
|
+
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3591
|
+
maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
|
|
3592
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3703
3593
|
if self.__is_registered or is_demo_dataset:
|
|
3704
|
-
|
|
3705
|
-
|
|
3706
|
-
self.
|
|
3707
|
-
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3594
|
+
search_keys[maybe_key] = SearchKey.PHONE
|
|
3595
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3596
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3708
3597
|
if not silent_mode:
|
|
3709
|
-
print(self.bundle.get("phone_detected").format(
|
|
3598
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3710
3599
|
else:
|
|
3711
3600
|
self.logger.warning(
|
|
3712
|
-
f"Autodetected search key PHONE in column {
|
|
3713
|
-
"But not used because not registered user"
|
|
3601
|
+
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3714
3602
|
)
|
|
3715
3603
|
if not silent_mode:
|
|
3716
3604
|
print(self.bundle.get("phone_detected_not_registered"))
|
|
@@ -3805,7 +3693,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3805
3693
|
def sample(inp, sample_index):
|
|
3806
3694
|
if _num_samples(inp) <= 1000:
|
|
3807
3695
|
return inp
|
|
3808
|
-
if isinstance(inp, pd.DataFrame
|
|
3696
|
+
if isinstance(inp, (pd.DataFrame, pd.Series)):
|
|
3809
3697
|
return inp.sample(n=1000, random_state=random_state)
|
|
3810
3698
|
if isinstance(inp, np.ndarray):
|
|
3811
3699
|
return inp[sample_index]
|
upgini/http.py
CHANGED
|
@@ -22,6 +22,7 @@ from pydantic import BaseModel
|
|
|
22
22
|
from pythonjsonlogger import jsonlogger
|
|
23
23
|
from requests.exceptions import RequestException
|
|
24
24
|
|
|
25
|
+
from upgini.__about__ import __version__
|
|
25
26
|
from upgini.errors import (
|
|
26
27
|
HttpError,
|
|
27
28
|
UnauthorizedError,
|
|
@@ -38,17 +39,17 @@ from upgini.metadata import (
|
|
|
38
39
|
from upgini.resource_bundle import bundle
|
|
39
40
|
from upgini.utils.track_info import get_track_metrics
|
|
40
41
|
|
|
41
|
-
try:
|
|
42
|
-
|
|
42
|
+
# try:
|
|
43
|
+
# from importlib.metadata import version # type: ignore
|
|
43
44
|
|
|
44
|
-
|
|
45
|
-
except ImportError:
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
# __version__ = version("upgini")
|
|
46
|
+
# except ImportError:
|
|
47
|
+
# try:
|
|
48
|
+
# from importlib_metadata import version # type: ignore
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
# __version__ = version("upgini")
|
|
51
|
+
# except ImportError:
|
|
52
|
+
# __version__ = "Upgini wasn't installed"
|
|
52
53
|
|
|
53
54
|
UPGINI_URL: str = "UPGINI_URL"
|
|
54
55
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
@@ -925,7 +926,7 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
|
|
|
925
926
|
return api_token is None or api_token == "" or api_token == DEMO_API_KEY
|
|
926
927
|
|
|
927
928
|
|
|
928
|
-
@lru_cache
|
|
929
|
+
@lru_cache
|
|
929
930
|
def _get_rest_client(
|
|
930
931
|
backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
|
|
931
932
|
) -> _RestClient:
|