upgini 1.1.277__py3-none-any.whl → 1.1.278a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/dataset.py +11 -2
- upgini/features_enricher.py +210 -98
- upgini/metadata.py +10 -2
- upgini/metrics.py +1 -1
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/datetime_utils.py +2 -2
- upgini/utils/deduplicate_utils.py +11 -1
- upgini/utils/email_utils.py +5 -0
- {upgini-1.1.277.dist-info → upgini-1.1.278a1.dist-info}/METADATA +1 -1
- {upgini-1.1.277.dist-info → upgini-1.1.278a1.dist-info}/RECORD +14 -14
- {upgini-1.1.277.dist-info → upgini-1.1.278a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.277.dist-info → upgini-1.1.278a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.277.dist-info → upgini-1.1.278a1.dist-info}/top_level.txt +0 -0
upgini/dataset.py
CHANGED
|
@@ -23,7 +23,9 @@ from pandas.api.types import (
|
|
|
23
23
|
from upgini.errors import ValidationError
|
|
24
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
25
|
from upgini.metadata import (
|
|
26
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
26
27
|
EVAL_SET_INDEX,
|
|
28
|
+
SEARCH_KEY_UNNEST,
|
|
27
29
|
SYSTEM_COLUMNS,
|
|
28
30
|
SYSTEM_RECORD_ID,
|
|
29
31
|
TARGET,
|
|
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
79
81
|
path: Optional[str] = None,
|
|
80
82
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
81
83
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
84
|
+
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
82
85
|
model_task_type: Optional[ModelTaskType] = None,
|
|
83
86
|
random_state: Optional[int] = None,
|
|
84
87
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
113
116
|
self.description = description
|
|
114
117
|
self.meaning_types = meaning_types
|
|
115
118
|
self.search_keys = search_keys
|
|
119
|
+
self.unnest_search_keys = unnest_search_keys
|
|
116
120
|
self.ignore_columns = []
|
|
117
121
|
self.hierarchical_group_keys = []
|
|
118
122
|
self.hierarchical_subgroup_keys = []
|
|
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
172
176
|
new_columns = []
|
|
173
177
|
dup_counter = 0
|
|
174
178
|
for column in self.data.columns:
|
|
175
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
|
|
176
180
|
self.columns_renaming[column] = column
|
|
177
181
|
new_columns.append(column)
|
|
178
182
|
continue
|
|
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
353
357
|
|
|
354
358
|
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
355
359
|
try:
|
|
356
|
-
self.data[postal_code] =
|
|
360
|
+
self.data[postal_code] = (
|
|
361
|
+
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
362
|
+
)
|
|
357
363
|
except Exception:
|
|
358
364
|
pass
|
|
359
365
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
803
809
|
meaningType=meaning_type,
|
|
804
810
|
minMaxValues=min_max_values,
|
|
805
811
|
)
|
|
812
|
+
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
813
|
+
column_meta.isUnnest = True
|
|
814
|
+
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
806
815
|
|
|
807
816
|
columns.append(column_meta)
|
|
808
817
|
|
upgini/features_enricher.py
CHANGED
|
@@ -11,6 +11,7 @@ import sys
|
|
|
11
11
|
import tempfile
|
|
12
12
|
import time
|
|
13
13
|
import uuid
|
|
14
|
+
from collections import Counter
|
|
14
15
|
from dataclasses import dataclass
|
|
15
16
|
from threading import Thread
|
|
16
17
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -45,9 +46,11 @@ from upgini.mdc import MDC
|
|
|
45
46
|
from upgini.metadata import (
|
|
46
47
|
COUNTRY,
|
|
47
48
|
DEFAULT_INDEX,
|
|
49
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
48
50
|
EVAL_SET_INDEX,
|
|
49
51
|
ORIGINAL_INDEX,
|
|
50
52
|
RENAMED_INDEX,
|
|
53
|
+
SEARCH_KEY_UNNEST,
|
|
51
54
|
SORT_ID,
|
|
52
55
|
SYSTEM_RECORD_ID,
|
|
53
56
|
TARGET,
|
|
@@ -248,7 +251,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
248
251
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
249
252
|
|
|
250
253
|
validate_version(self.logger)
|
|
251
|
-
self.search_keys = search_keys or
|
|
254
|
+
self.search_keys = search_keys or {}
|
|
252
255
|
self.country_code = country_code
|
|
253
256
|
self.__validate_search_keys(search_keys, search_id)
|
|
254
257
|
self.model_task_type = model_task_type
|
|
@@ -1188,7 +1191,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1188
1191
|
email_column = self._get_email_column(search_keys)
|
|
1189
1192
|
hem_column = self._get_hem_column(search_keys)
|
|
1190
1193
|
if email_column:
|
|
1191
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1194
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1192
1195
|
extended_X = converter.convert(extended_X)
|
|
1193
1196
|
generated_features.extend(converter.generated_features)
|
|
1194
1197
|
if (
|
|
@@ -1404,7 +1407,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1404
1407
|
fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
|
|
1405
1408
|
)
|
|
1406
1409
|
|
|
1407
|
-
fitting_eval_set_dict =
|
|
1410
|
+
fitting_eval_set_dict = {}
|
|
1408
1411
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1409
1412
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1410
1413
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1516,7 +1519,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1516
1519
|
def __sample_only_input(
|
|
1517
1520
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
1518
1521
|
) -> _SampledDataForMetrics:
|
|
1519
|
-
eval_set_sampled_dict =
|
|
1522
|
+
eval_set_sampled_dict = {}
|
|
1520
1523
|
|
|
1521
1524
|
df = validated_X.copy()
|
|
1522
1525
|
df[TARGET] = validated_y
|
|
@@ -1542,7 +1545,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1542
1545
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1543
1546
|
|
|
1544
1547
|
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1545
|
-
df_extended = self.__add_fit_system_record_id(df_extended,
|
|
1548
|
+
df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys)
|
|
1546
1549
|
|
|
1547
1550
|
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1548
1551
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -1566,7 +1569,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1566
1569
|
trace_id: str,
|
|
1567
1570
|
remove_outliers_calc_metrics: Optional[bool],
|
|
1568
1571
|
) -> _SampledDataForMetrics:
|
|
1569
|
-
eval_set_sampled_dict =
|
|
1572
|
+
eval_set_sampled_dict = {}
|
|
1570
1573
|
search_keys = self.fit_search_keys
|
|
1571
1574
|
|
|
1572
1575
|
rows_to_drop = None
|
|
@@ -1640,7 +1643,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1640
1643
|
progress_bar: Optional[ProgressBar],
|
|
1641
1644
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1642
1645
|
) -> _SampledDataForMetrics:
|
|
1643
|
-
eval_set_sampled_dict =
|
|
1646
|
+
eval_set_sampled_dict = {}
|
|
1644
1647
|
if eval_set is not None:
|
|
1645
1648
|
self.logger.info("Transform with eval_set")
|
|
1646
1649
|
# concatenate X and eval_set with eval_set_index
|
|
@@ -1662,7 +1665,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1662
1665
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1663
1666
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1664
1667
|
|
|
1665
|
-
eval_set_sampled_dict =
|
|
1668
|
+
eval_set_sampled_dict = {}
|
|
1666
1669
|
|
|
1667
1670
|
tmp_target_name = "__target"
|
|
1668
1671
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
@@ -1925,11 +1928,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1925
1928
|
self.logger.info("Input dataset hasn't date column")
|
|
1926
1929
|
if self.add_date_if_missing:
|
|
1927
1930
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1931
|
+
|
|
1932
|
+
# Don't pass all features in backend on transform
|
|
1933
|
+
original_features_for_transform = []
|
|
1934
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1935
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1936
|
+
if len(features_not_to_pass) > 0:
|
|
1937
|
+
# Pass only features that need for transform
|
|
1938
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1939
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1940
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1941
|
+
original_features_for_transform = [
|
|
1942
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1943
|
+
]
|
|
1944
|
+
|
|
1945
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1946
|
+
|
|
1947
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1948
|
+
|
|
1949
|
+
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1950
|
+
df[columns_for_system_record_id], index=False
|
|
1951
|
+
).astype("Float64")
|
|
1952
|
+
|
|
1953
|
+
# Explode multiple search keys
|
|
1954
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1955
|
+
|
|
1928
1956
|
email_column = self._get_email_column(search_keys)
|
|
1929
1957
|
hem_column = self._get_hem_column(search_keys)
|
|
1930
1958
|
email_converted_to_hem = False
|
|
1931
1959
|
if email_column:
|
|
1932
|
-
converter = EmailSearchKeyConverter(
|
|
1960
|
+
converter = EmailSearchKeyConverter(
|
|
1961
|
+
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
1962
|
+
)
|
|
1933
1963
|
df = converter.convert(df)
|
|
1934
1964
|
generated_features.extend(converter.generated_features)
|
|
1935
1965
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1943,30 +1973,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1943
1973
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1944
1974
|
|
|
1945
1975
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1946
|
-
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1976
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1977
|
+
for col in original_features_for_transform:
|
|
1978
|
+
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1979
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1947
1980
|
|
|
1948
1981
|
if email_converted_to_hem:
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
# Don't pass features in backend on transform
|
|
1952
|
-
original_features_for_transform = None
|
|
1953
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1954
|
-
if len(non_keys_columns) > 0:
|
|
1955
|
-
# Pass only features that need for transform
|
|
1956
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
-
original_features_for_transform = [
|
|
1960
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
-
]
|
|
1962
|
-
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1982
|
+
features_not_to_pass.append(email_column)
|
|
1963
1983
|
|
|
1964
|
-
|
|
1984
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1985
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1965
1986
|
|
|
1966
1987
|
if add_fit_system_record_id:
|
|
1967
|
-
df = self.__add_fit_system_record_id(df,
|
|
1988
|
+
df = self.__add_fit_system_record_id(df, {}, search_keys)
|
|
1968
1989
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1969
|
-
|
|
1990
|
+
features_not_to_pass.append(SORT_ID)
|
|
1970
1991
|
|
|
1971
1992
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1972
1993
|
|
|
@@ -1974,16 +1995,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1974
1995
|
"Float64"
|
|
1975
1996
|
)
|
|
1976
1997
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1998
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
1999
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2000
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
1977
2001
|
|
|
1978
2002
|
df = df.reset_index(drop=True)
|
|
1979
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
2003
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
|
|
1980
2004
|
if add_fit_system_record_id:
|
|
1981
2005
|
system_columns_with_original_index.append(SORT_ID)
|
|
1982
2006
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
1983
2007
|
|
|
1984
2008
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
1985
2009
|
|
|
1986
|
-
df_without_features = df.drop(columns=
|
|
2010
|
+
df_without_features = df.drop(columns=features_not_to_pass)
|
|
1987
2011
|
|
|
1988
2012
|
df_without_features = clean_full_duplicates(
|
|
1989
2013
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -1995,12 +2019,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1995
2019
|
dataset = Dataset(
|
|
1996
2020
|
"sample_" + str(uuid.uuid4()),
|
|
1997
2021
|
df=df_without_features,
|
|
2022
|
+
meaning_types=meaning_types,
|
|
2023
|
+
search_keys=combined_search_keys,
|
|
2024
|
+
unnest_search_keys=unnest_search_keys,
|
|
1998
2025
|
date_format=self.date_format,
|
|
1999
2026
|
rest_client=self.rest_client,
|
|
2000
2027
|
logger=self.logger,
|
|
2001
2028
|
)
|
|
2002
|
-
dataset.meaning_types = meaning_types
|
|
2003
|
-
dataset.search_keys = combined_search_keys
|
|
2004
2029
|
if email_converted_to_hem:
|
|
2005
2030
|
dataset.ignore_columns = [email_column]
|
|
2006
2031
|
|
|
@@ -2139,6 +2164,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2139
2164
|
|
|
2140
2165
|
key_types = search_keys.values()
|
|
2141
2166
|
|
|
2167
|
+
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2168
|
+
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2169
|
+
for multi_key in multi_keys:
|
|
2170
|
+
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2171
|
+
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2172
|
+
self.logger.warning(msg)
|
|
2173
|
+
raise ValidationError(msg)
|
|
2174
|
+
|
|
2142
2175
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2143
2176
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2144
2177
|
self.logger.warning(msg)
|
|
@@ -2154,11 +2187,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2154
2187
|
self.logger.warning(msg)
|
|
2155
2188
|
raise ValidationError(msg)
|
|
2156
2189
|
|
|
2157
|
-
for key_type in SearchKey.__members__.values():
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2190
|
+
# for key_type in SearchKey.__members__.values():
|
|
2191
|
+
# if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2192
|
+
# msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2193
|
+
# self.logger.warning(msg)
|
|
2194
|
+
# raise ValidationError(msg)
|
|
2162
2195
|
|
|
2163
2196
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2164
2197
|
# if (
|
|
@@ -2296,14 +2329,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2296
2329
|
self.logger.info("Input dataset hasn't date column")
|
|
2297
2330
|
if self.add_date_if_missing:
|
|
2298
2331
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2299
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2300
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2301
|
-
email_converted_to_hem = False
|
|
2302
|
-
if email_column:
|
|
2303
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2304
|
-
df = converter.convert(df)
|
|
2305
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2307
2332
|
if (
|
|
2308
2333
|
self.detect_missing_search_keys
|
|
2309
2334
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2312,7 +2337,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2312
2337
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2313
2338
|
df = converter.convert(df)
|
|
2314
2339
|
|
|
2340
|
+
# Explode multiple search keys
|
|
2315
2341
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2342
|
+
meaning_types = {
|
|
2343
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2344
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2345
|
+
}
|
|
2346
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2347
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2348
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2349
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2350
|
+
|
|
2351
|
+
# TODO check that this is correct for enrichment
|
|
2352
|
+
self.df_with_original_index = df.copy()
|
|
2353
|
+
|
|
2354
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2355
|
+
|
|
2356
|
+
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2357
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2358
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2359
|
+
email_converted_to_hem = False
|
|
2360
|
+
if email_column:
|
|
2361
|
+
converter = EmailSearchKeyConverter(
|
|
2362
|
+
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2363
|
+
)
|
|
2364
|
+
df = converter.convert(df)
|
|
2365
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2366
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2367
|
+
|
|
2368
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2369
|
+
self.fit_search_keys.keys()
|
|
2370
|
+
)
|
|
2316
2371
|
if email_converted_to_hem:
|
|
2317
2372
|
non_feature_columns.append(email_column)
|
|
2318
2373
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2336,12 +2391,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2336
2391
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2337
2392
|
}
|
|
2338
2393
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2394
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2395
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2396
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2339
2397
|
if eval_set is not None and len(eval_set) > 0:
|
|
2340
2398
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2341
2399
|
|
|
2342
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2400
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2343
2401
|
|
|
2344
|
-
self.df_with_original_index = df.copy()
|
|
2345
2402
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2346
2403
|
|
|
2347
2404
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2349,14 +2406,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2349
2406
|
dataset = Dataset(
|
|
2350
2407
|
"tds_" + str(uuid.uuid4()),
|
|
2351
2408
|
df=df,
|
|
2409
|
+
meaning_types=meaning_types,
|
|
2410
|
+
search_keys=combined_search_keys,
|
|
2411
|
+
unnest_search_keys=unnest_search_keys,
|
|
2352
2412
|
model_task_type=model_task_type,
|
|
2353
2413
|
date_format=self.date_format,
|
|
2354
2414
|
random_state=self.random_state,
|
|
2355
2415
|
rest_client=self.rest_client,
|
|
2356
2416
|
logger=self.logger,
|
|
2357
2417
|
)
|
|
2358
|
-
dataset.meaning_types = meaning_types
|
|
2359
|
-
dataset.search_keys = combined_search_keys
|
|
2360
2418
|
if email_converted_to_hem:
|
|
2361
2419
|
dataset.ignore_columns = [email_column]
|
|
2362
2420
|
|
|
@@ -2905,15 +2963,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2905
2963
|
|
|
2906
2964
|
@staticmethod
|
|
2907
2965
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2908
|
-
for col, t in search_keys.items()
|
|
2909
|
-
|
|
2910
|
-
|
|
2966
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
|
2967
|
+
if len(cols) > 1:
|
|
2968
|
+
raise Exception("More than one email column found after unnest")
|
|
2969
|
+
if len(cols) == 1:
|
|
2970
|
+
return cols[0]
|
|
2911
2971
|
|
|
2912
2972
|
@staticmethod
|
|
2913
2973
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2914
|
-
for col, t in search_keys.items()
|
|
2915
|
-
|
|
2916
|
-
|
|
2974
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
|
2975
|
+
if len(cols) > 1:
|
|
2976
|
+
raise Exception("More than one hem column found after unnest")
|
|
2977
|
+
if len(cols) == 1:
|
|
2978
|
+
return cols[0]
|
|
2917
2979
|
|
|
2918
2980
|
@staticmethod
|
|
2919
2981
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2921,8 +2983,44 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2921
2983
|
if t == SearchKey.PHONE:
|
|
2922
2984
|
return col
|
|
2923
2985
|
|
|
2986
|
+
def _explode_multiple_search_keys(
|
|
2987
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
2988
|
+
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
2989
|
+
# find groups of multiple search keys
|
|
2990
|
+
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
2991
|
+
for key_name, key_type in search_keys.items():
|
|
2992
|
+
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
2993
|
+
search_key_names_by_type = {
|
|
2994
|
+
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
2995
|
+
}
|
|
2996
|
+
if len(search_key_names_by_type) == 0:
|
|
2997
|
+
return df, {}
|
|
2998
|
+
|
|
2999
|
+
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3000
|
+
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3001
|
+
exploded_dfs = []
|
|
3002
|
+
unnest_search_keys = {}
|
|
3003
|
+
|
|
3004
|
+
for key_type, key_names in search_key_names_by_type.items():
|
|
3005
|
+
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
3006
|
+
exploded_df = pd.melt(
|
|
3007
|
+
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
3008
|
+
)
|
|
3009
|
+
exploded_dfs.append(exploded_df)
|
|
3010
|
+
for old_key in key_names:
|
|
3011
|
+
del search_keys[old_key]
|
|
3012
|
+
search_keys[new_search_key] = key_type
|
|
3013
|
+
unnest_search_keys[new_search_key] = key_names
|
|
3014
|
+
|
|
3015
|
+
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3016
|
+
return df, unnest_search_keys
|
|
3017
|
+
|
|
2924
3018
|
def __add_fit_system_record_id(
|
|
2925
|
-
self,
|
|
3019
|
+
self,
|
|
3020
|
+
df: pd.DataFrame,
|
|
3021
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3022
|
+
search_keys: Dict[str, SearchKey],
|
|
3023
|
+
id_name: str,
|
|
2926
3024
|
) -> pd.DataFrame:
|
|
2927
3025
|
# save original order or rows
|
|
2928
3026
|
original_index_name = df.index.name
|
|
@@ -2971,14 +3069,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2971
3069
|
|
|
2972
3070
|
df = df.reset_index(drop=True).reset_index()
|
|
2973
3071
|
# system_record_id saves correct order for fit
|
|
2974
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
3072
|
+
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
2975
3073
|
|
|
2976
3074
|
# return original order
|
|
2977
3075
|
df = df.set_index(ORIGINAL_INDEX)
|
|
2978
3076
|
df.index.name = original_index_name
|
|
2979
3077
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
2980
3078
|
|
|
2981
|
-
meaning_types[
|
|
3079
|
+
meaning_types[id_name] = (
|
|
3080
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3081
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3082
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3083
|
+
)
|
|
2982
3084
|
return df
|
|
2983
3085
|
|
|
2984
3086
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3033,7 +3135,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3033
3135
|
)
|
|
3034
3136
|
|
|
3035
3137
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3036
|
-
dup_features = [
|
|
3138
|
+
dup_features = [
|
|
3139
|
+
c
|
|
3140
|
+
for c in comparing_columns
|
|
3141
|
+
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3142
|
+
]
|
|
3037
3143
|
if len(dup_features) > 0:
|
|
3038
3144
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3039
3145
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3044,8 +3150,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3044
3150
|
result_features = pd.merge(
|
|
3045
3151
|
df_with_original_index,
|
|
3046
3152
|
result_features,
|
|
3047
|
-
|
|
3048
|
-
right_on=SYSTEM_RECORD_ID,
|
|
3153
|
+
on=ENTITY_SYSTEM_RECORD_ID,
|
|
3049
3154
|
how="left" if is_transform else "inner",
|
|
3050
3155
|
)
|
|
3051
3156
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3056,7 +3161,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3056
3161
|
result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
|
|
3057
3162
|
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3058
3163
|
|
|
3059
|
-
result_eval_sets =
|
|
3164
|
+
result_eval_sets = {}
|
|
3060
3165
|
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3061
3166
|
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3062
3167
|
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
@@ -3262,7 +3367,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3262
3367
|
if autofe_feature.op.is_vector:
|
|
3263
3368
|
continue
|
|
3264
3369
|
|
|
3265
|
-
description =
|
|
3370
|
+
description = {}
|
|
3266
3371
|
|
|
3267
3372
|
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3268
3373
|
if feature_meta is None:
|
|
@@ -3428,13 +3533,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3428
3533
|
self.warning_counter.increment()
|
|
3429
3534
|
|
|
3430
3535
|
if len(valid_search_keys) == 1:
|
|
3431
|
-
|
|
3432
|
-
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3436
|
-
|
|
3437
|
-
|
|
3536
|
+
key, value = list(valid_search_keys.items())[0]
|
|
3537
|
+
# Show warning for country only if country is the only key
|
|
3538
|
+
if x[key].nunique() == 1:
|
|
3539
|
+
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3540
|
+
print(msg)
|
|
3541
|
+
self.logger.warning(msg)
|
|
3542
|
+
self.warning_counter.increment()
|
|
3438
3543
|
|
|
3439
3544
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3440
3545
|
|
|
@@ -3544,61 +3649,68 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3544
3649
|
def check_need_detect(search_key: SearchKey):
|
|
3545
3650
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3546
3651
|
|
|
3547
|
-
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3548
|
-
|
|
3549
|
-
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
|
|
3652
|
+
# if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3653
|
+
if check_need_detect(SearchKey.POSTAL_CODE):
|
|
3654
|
+
maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3655
|
+
if maybe_keys:
|
|
3656
|
+
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
3657
|
+
search_keys.update(new_keys)
|
|
3658
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3659
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3553
3660
|
if not silent_mode:
|
|
3554
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3661
|
+
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
3555
3662
|
|
|
3556
3663
|
if (
|
|
3557
3664
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3558
3665
|
and self.country_code is None
|
|
3559
3666
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3560
3667
|
):
|
|
3561
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3562
|
-
if maybe_key
|
|
3563
|
-
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3564
|
-
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3668
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3669
|
+
if maybe_key:
|
|
3670
|
+
search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3671
|
+
self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3565
3672
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3566
3673
|
if not silent_mode:
|
|
3567
3674
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3568
3675
|
|
|
3569
3676
|
if (
|
|
3570
|
-
SearchKey.EMAIL not in search_keys.values()
|
|
3571
|
-
|
|
3677
|
+
# SearchKey.EMAIL not in search_keys.values()
|
|
3678
|
+
SearchKey.HEM not in search_keys.values()
|
|
3572
3679
|
and check_need_detect(SearchKey.HEM)
|
|
3573
3680
|
):
|
|
3574
|
-
|
|
3575
|
-
if
|
|
3681
|
+
maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3682
|
+
if maybe_keys:
|
|
3576
3683
|
if self.__is_registered or is_demo_dataset:
|
|
3577
|
-
|
|
3578
|
-
|
|
3579
|
-
self.
|
|
3684
|
+
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
3685
|
+
search_keys.update(new_keys)
|
|
3686
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3687
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3580
3688
|
if not silent_mode:
|
|
3581
|
-
print(self.bundle.get("email_detected").format(
|
|
3689
|
+
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
3582
3690
|
else:
|
|
3583
3691
|
self.logger.warning(
|
|
3584
|
-
f"Autodetected search key EMAIL in column {
|
|
3692
|
+
f"Autodetected search key EMAIL in column {maybe_keys}."
|
|
3693
|
+
" But not used because not registered user"
|
|
3585
3694
|
)
|
|
3586
3695
|
if not silent_mode:
|
|
3587
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3696
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3588
3697
|
self.warning_counter.increment()
|
|
3589
3698
|
|
|
3590
|
-
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3591
|
-
|
|
3592
|
-
|
|
3699
|
+
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3700
|
+
if check_need_detect(SearchKey.PHONE):
|
|
3701
|
+
maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3702
|
+
if maybe_keys:
|
|
3593
3703
|
if self.__is_registered or is_demo_dataset:
|
|
3594
|
-
|
|
3595
|
-
|
|
3596
|
-
self.
|
|
3704
|
+
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
3705
|
+
search_keys.update(new_keys)
|
|
3706
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3707
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3597
3708
|
if not silent_mode:
|
|
3598
|
-
print(self.bundle.get("phone_detected").format(
|
|
3709
|
+
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
3599
3710
|
else:
|
|
3600
3711
|
self.logger.warning(
|
|
3601
|
-
f"Autodetected search key PHONE in column {
|
|
3712
|
+
f"Autodetected search key PHONE in column {maybe_keys}. "
|
|
3713
|
+
"But not used because not registered user"
|
|
3602
3714
|
)
|
|
3603
3715
|
if not silent_mode:
|
|
3604
3716
|
print(self.bundle.get("phone_detected_not_registered"))
|
upgini/metadata.py
CHANGED
|
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Set
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
|
|
6
6
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
|
|
8
|
+
SEARCH_KEY_UNNEST = "search_key_unnest"
|
|
7
9
|
SORT_ID = "sort_id"
|
|
8
10
|
EVAL_SET_INDEX = "eval_set_index"
|
|
9
11
|
TARGET = "target"
|
|
@@ -11,7 +13,7 @@ COUNTRY = "country_iso_code"
|
|
|
11
13
|
RENAMED_INDEX = "index_col"
|
|
12
14
|
DEFAULT_INDEX = "index"
|
|
13
15
|
ORIGINAL_INDEX = "original_index"
|
|
14
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY
|
|
16
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
class FileColumnMeaningType(Enum):
|
|
@@ -37,6 +39,8 @@ class FileColumnMeaningType(Enum):
|
|
|
37
39
|
POSTAL_CODE = "POSTAL_CODE"
|
|
38
40
|
SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
|
|
39
41
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
42
|
+
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
43
|
+
UNNEST_KEY = "UNNEST_KEY"
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
class SearchKey(Enum):
|
|
@@ -182,6 +186,10 @@ class FileColumnMetadata(BaseModel):
|
|
|
182
186
|
meaningType: FileColumnMeaningType
|
|
183
187
|
minMaxValues: Optional[NumericInterval] = None
|
|
184
188
|
originalName: Optional[str]
|
|
189
|
+
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
190
|
+
isUnnest: bool = False
|
|
191
|
+
# list of original etalon key column names like msisdn1, msisdn2
|
|
192
|
+
unnestKeyNames: Optional[list[str]]
|
|
185
193
|
|
|
186
194
|
|
|
187
195
|
class FileMetadata(BaseModel):
|
|
@@ -276,7 +284,7 @@ class FeaturesFilter(BaseModel):
|
|
|
276
284
|
|
|
277
285
|
|
|
278
286
|
class RuntimeParameters(BaseModel):
|
|
279
|
-
properties: Dict[str, str] =
|
|
287
|
+
properties: Dict[str, str] = {}
|
|
280
288
|
|
|
281
289
|
|
|
282
290
|
class SearchCustomization(BaseModel):
|
upgini/metrics.py
CHANGED
|
@@ -357,7 +357,7 @@ class EstimatorWrapper:
|
|
|
357
357
|
"logger": logger,
|
|
358
358
|
}
|
|
359
359
|
if estimator is None:
|
|
360
|
-
params =
|
|
360
|
+
params = {}
|
|
361
361
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
362
362
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
363
363
|
if target_type == ModelTaskType.MULTICLASS:
|
|
@@ -88,6 +88,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
|
88
88
|
search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
89
89
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
90
90
|
single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
91
|
+
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
91
92
|
unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
92
93
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
93
94
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
11
|
raise NotImplementedError()
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
|
|
14
|
+
return [
|
|
15
|
+
column_name
|
|
16
|
+
for column_name in column_names
|
|
17
|
+
if self._is_search_key_by_name(column_name)
|
|
18
|
+
]
|
|
17
19
|
|
|
18
|
-
def
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
for column_name in df.columns:
|
|
20
|
+
def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
|
|
21
|
+
other_columns = [col for col in df.columns if col not in existing_search_keys]
|
|
22
|
+
columns_by_names = self._get_search_keys_by_name(other_columns)
|
|
23
|
+
columns_by_values = []
|
|
24
|
+
for column_name in other_columns:
|
|
24
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
25
|
-
|
|
26
|
+
columns_by_values.append(column_name)
|
|
27
|
+
return list(set(columns_by_names + columns_by_values))
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -126,9 +126,9 @@ class DateTimeSearchKeyConverter:
|
|
|
126
126
|
df.drop(columns=seconds, inplace=True)
|
|
127
127
|
|
|
128
128
|
if keep_time:
|
|
129
|
-
df[self.DATETIME_COL] = df[self.date_column].
|
|
129
|
+
df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
|
|
130
130
|
df[self.DATETIME_COL] = df[self.DATETIME_COL].apply(self._int_to_opt).astype("Int64")
|
|
131
|
-
df[self.date_column] = df[self.date_column].dt.floor("D").
|
|
131
|
+
df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
|
|
132
132
|
df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
|
|
133
133
|
|
|
134
134
|
self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")
|
|
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import
|
|
6
|
+
from upgini.metadata import (
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
+
EVAL_SET_INDEX,
|
|
9
|
+
SORT_ID,
|
|
10
|
+
SYSTEM_RECORD_ID,
|
|
11
|
+
TARGET,
|
|
12
|
+
ModelTaskType,
|
|
13
|
+
SearchKey,
|
|
14
|
+
)
|
|
7
15
|
from upgini.resource_bundle import ResourceBundle
|
|
8
16
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
17
|
from upgini.utils.target_utils import define_task
|
|
@@ -143,6 +151,8 @@ def clean_full_duplicates(
|
|
|
143
151
|
unique_columns = df.columns.tolist()
|
|
144
152
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
145
153
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
+
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
+
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
146
156
|
if SORT_ID in unique_columns:
|
|
147
157
|
unique_columns.remove(SORT_ID)
|
|
148
158
|
if EVAL_SET_INDEX in unique_columns:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
|
|
|
38
38
|
email_column: str,
|
|
39
39
|
hem_column: Optional[str],
|
|
40
40
|
search_keys: Dict[str, SearchKey],
|
|
41
|
+
unnest_search_keys: Optional[List[str]] = None,
|
|
41
42
|
logger: Optional[logging.Logger] = None,
|
|
42
43
|
):
|
|
43
44
|
self.email_column = email_column
|
|
44
45
|
self.hem_column = hem_column
|
|
45
46
|
self.search_keys = search_keys
|
|
47
|
+
self.unnest_search_keys = unnest_search_keys
|
|
46
48
|
if logger is not None:
|
|
47
49
|
self.logger = logger
|
|
48
50
|
else:
|
|
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
|
|
|
80
82
|
del self.search_keys[self.email_column]
|
|
81
83
|
return df
|
|
82
84
|
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
+
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
83
86
|
self.email_converted_to_hem = True
|
|
84
87
|
|
|
85
88
|
del self.search_keys[self.email_column]
|
|
89
|
+
if self.email_column in self.unnest_search_keys:
|
|
90
|
+
self.unnest_search_keys.remove(self.email_column)
|
|
86
91
|
|
|
87
92
|
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
88
93
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=qdIxHiDGZT_iNTBswNeIuc9TPfvUlNqvSmRqMyigZBM,46187
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=wsrm9uwIr7hCNLBXTEuw4nIuIXfJrsZ7RWFeG24tTzI,181665
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
8
|
-
upgini/metadata.py,sha256=
|
|
9
|
-
upgini/metrics.py,sha256=
|
|
8
|
+
upgini/metadata.py,sha256=TNZbtIuxYkBFGQu3gGm2flA6vsKyUPN4Q-Du3fFjmSM,10101
|
|
9
|
+
upgini/metrics.py,sha256=YhyPik38cBI5x5KfdiE_qocJnUjZbSqUj8GUtCqnG0g,29648
|
|
10
10
|
upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
|
|
11
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
12
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
@@ -29,22 +29,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
29
29
|
upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
|
|
30
30
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
31
31
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
32
|
-
upgini/resource_bundle/strings.properties,sha256
|
|
32
|
+
upgini/resource_bundle/strings.properties,sha256=-JDIa0nAoA5utK7UZZAUgLDsozJNI08dDcbIaOSsvQg,26353
|
|
33
33
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
34
34
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
36
36
|
upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
|
|
37
37
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
38
38
|
upgini/utils/__init__.py,sha256=YVum3lRKpyfqoJy_7HJyU6SmIgbmG8QLkHIpibE_ud8,842
|
|
39
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
39
|
+
upgini/utils/base_search_key_detector.py,sha256=VvEdamjJT1wypsH6NAfOkPp7dHo7nxhl7LhwX7Z9N5w,1025
|
|
40
40
|
upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
|
|
41
41
|
upgini/utils/country_utils.py,sha256=pV8TBURthYqwSOfH1lxfYc2blm3OvfLFCMvRv8rKTp4,6511
|
|
42
42
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
43
43
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
44
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
44
|
+
upgini/utils/datetime_utils.py,sha256=RW9eGCGQyYBsIU9XbYKt4hQiXUNppb4Grszg4EdKeY4,10398
|
|
45
|
+
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
46
46
|
upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
|
|
47
|
-
upgini/utils/email_utils.py,sha256=
|
|
47
|
+
upgini/utils/email_utils.py,sha256=KHqIUagBWd3jOj3V7mW0ZkBOc-2XzAIA3p1xxZgy-L4,3813
|
|
48
48
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
49
49
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
50
50
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
@@ -56,8 +56,8 @@ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,4
|
|
|
56
56
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
57
57
|
upgini/utils/track_info.py,sha256=p8gmuHhLamZF5JG7K9DeK-PcytQhlFCR29lyRr-wq_U,5665
|
|
58
58
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
59
|
+
upgini-1.1.278a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
60
|
+
upgini-1.1.278a1.dist-info/METADATA,sha256=q6o1ge7o56ZvJk11K_v2tfPAREHvZMW9kHPbotGeJEo,48158
|
|
61
|
+
upgini-1.1.278a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
62
|
+
upgini-1.1.278a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
63
|
+
upgini-1.1.278a1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|