upgini 1.1.275__py3-none-any.whl → 1.1.275a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/ads.py +2 -6
- upgini/autofe/date.py +2 -9
- upgini/data_source/data_source_publisher.py +1 -1
- upgini/dataset.py +13 -6
- upgini/features_enricher.py +220 -154
- upgini/metadata.py +9 -1
- upgini/metrics.py +0 -12
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/__init__.py +2 -3
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +2 -2
- upgini/utils/datetime_utils.py +4 -7
- upgini/utils/deduplicate_utils.py +11 -1
- upgini/utils/email_utils.py +7 -2
- upgini/utils/features_validator.py +1 -2
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +13 -25
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/METADATA +2 -2
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/RECORD +23 -23
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
-
import datetime
|
|
3
2
|
import gc
|
|
4
3
|
import hashlib
|
|
5
4
|
import itertools
|
|
@@ -11,6 +10,7 @@ import sys
|
|
|
11
10
|
import tempfile
|
|
12
11
|
import time
|
|
13
12
|
import uuid
|
|
13
|
+
from collections import Counter
|
|
14
14
|
from dataclasses import dataclass
|
|
15
15
|
from threading import Thread
|
|
16
16
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -21,7 +21,6 @@ from pandas.api.types import (
|
|
|
21
21
|
is_bool,
|
|
22
22
|
is_datetime64_any_dtype,
|
|
23
23
|
is_numeric_dtype,
|
|
24
|
-
is_object_dtype,
|
|
25
24
|
is_period_dtype,
|
|
26
25
|
is_string_dtype,
|
|
27
26
|
)
|
|
@@ -45,9 +44,11 @@ from upgini.mdc import MDC
|
|
|
45
44
|
from upgini.metadata import (
|
|
46
45
|
COUNTRY,
|
|
47
46
|
DEFAULT_INDEX,
|
|
47
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
48
48
|
EVAL_SET_INDEX,
|
|
49
49
|
ORIGINAL_INDEX,
|
|
50
50
|
RENAMED_INDEX,
|
|
51
|
+
SEARCH_KEY_UNNEST,
|
|
51
52
|
SORT_ID,
|
|
52
53
|
SYSTEM_RECORD_ID,
|
|
53
54
|
TARGET,
|
|
@@ -148,7 +149,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
148
149
|
"""
|
|
149
150
|
|
|
150
151
|
TARGET_NAME = "target"
|
|
151
|
-
CURRENT_DATE = "current_date"
|
|
152
152
|
RANDOM_STATE = 42
|
|
153
153
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
154
154
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -210,7 +210,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
210
210
|
client_ip: Optional[str] = None,
|
|
211
211
|
client_visitorid: Optional[str] = None,
|
|
212
212
|
custom_bundle_config: Optional[str] = None,
|
|
213
|
-
add_date_if_missing: bool = True,
|
|
214
213
|
**kwargs,
|
|
215
214
|
):
|
|
216
215
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -321,7 +320,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
321
320
|
self.raise_validation_error = raise_validation_error
|
|
322
321
|
self.exclude_columns = exclude_columns
|
|
323
322
|
self.baseline_score_column = baseline_score_column
|
|
324
|
-
self.add_date_if_missing = add_date_if_missing
|
|
325
323
|
|
|
326
324
|
def _get_api_key(self):
|
|
327
325
|
return self._api_key
|
|
@@ -425,9 +423,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
425
423
|
|
|
426
424
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
427
425
|
|
|
428
|
-
# Validate client estimator params
|
|
429
|
-
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
430
|
-
|
|
431
426
|
try:
|
|
432
427
|
self.X = X
|
|
433
428
|
self.y = y
|
|
@@ -821,7 +816,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
821
816
|
trace_id = trace_id or str(uuid.uuid4())
|
|
822
817
|
start_time = time.time()
|
|
823
818
|
with MDC(trace_id=trace_id):
|
|
824
|
-
self.logger.info("Start calculate metrics")
|
|
825
819
|
if len(args) > 0:
|
|
826
820
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
827
821
|
self.logger.warning(msg)
|
|
@@ -873,9 +867,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
873
867
|
self.__display_support_link(msg)
|
|
874
868
|
return None
|
|
875
869
|
|
|
876
|
-
cat_features
|
|
877
|
-
|
|
878
|
-
|
|
870
|
+
cat_features = None
|
|
871
|
+
search_keys_for_metrics = []
|
|
872
|
+
if (
|
|
873
|
+
estimator is not None
|
|
874
|
+
and hasattr(estimator, "get_param")
|
|
875
|
+
and estimator.get_param("cat_features") is not None
|
|
876
|
+
):
|
|
877
|
+
cat_features = estimator.get_param("cat_features")
|
|
878
|
+
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
879
|
+
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
880
|
+
for cat_feature in cat_features:
|
|
881
|
+
if cat_feature in self.search_keys:
|
|
882
|
+
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
883
|
+
search_keys_for_metrics.append(cat_feature)
|
|
884
|
+
else:
|
|
885
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
879
886
|
|
|
880
887
|
prepared_data = self._prepare_data_for_metrics(
|
|
881
888
|
trace_id=trace_id,
|
|
@@ -890,7 +897,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
890
897
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
891
898
|
progress_bar=progress_bar,
|
|
892
899
|
progress_callback=progress_callback,
|
|
893
|
-
cat_features=cat_features,
|
|
894
900
|
)
|
|
895
901
|
if prepared_data is None:
|
|
896
902
|
return None
|
|
@@ -1178,6 +1184,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1178
1184
|
search_keys = self.search_keys.copy()
|
|
1179
1185
|
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1180
1186
|
|
|
1187
|
+
unnest_search_keys = []
|
|
1188
|
+
|
|
1181
1189
|
extended_X = x.copy()
|
|
1182
1190
|
generated_features = []
|
|
1183
1191
|
date_column = self._get_date_column(search_keys)
|
|
@@ -1188,7 +1196,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1188
1196
|
email_column = self._get_email_column(search_keys)
|
|
1189
1197
|
hem_column = self._get_hem_column(search_keys)
|
|
1190
1198
|
if email_column:
|
|
1191
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1199
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, unnest_search_keys, self.logger)
|
|
1192
1200
|
extended_X = converter.convert(extended_X)
|
|
1193
1201
|
generated_features.extend(converter.generated_features)
|
|
1194
1202
|
if (
|
|
@@ -1266,29 +1274,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1266
1274
|
|
|
1267
1275
|
return _cv, groups
|
|
1268
1276
|
|
|
1269
|
-
def _get_client_cat_features(
|
|
1270
|
-
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1271
|
-
) -> Optional[List[str]]:
|
|
1272
|
-
cat_features = None
|
|
1273
|
-
search_keys_for_metrics = []
|
|
1274
|
-
if (
|
|
1275
|
-
estimator is not None
|
|
1276
|
-
and hasattr(estimator, "get_param")
|
|
1277
|
-
and estimator.get_param("cat_features") is not None
|
|
1278
|
-
):
|
|
1279
|
-
cat_features = estimator.get_param("cat_features")
|
|
1280
|
-
if len(cat_features) > 0:
|
|
1281
|
-
if all([isinstance(f, int) for f in cat_features]):
|
|
1282
|
-
cat_features = [X.columns[i] for i in cat_features]
|
|
1283
|
-
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1284
|
-
for cat_feature in cat_features:
|
|
1285
|
-
if cat_feature in search_keys:
|
|
1286
|
-
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1287
|
-
search_keys_for_metrics.append(cat_feature)
|
|
1288
|
-
else:
|
|
1289
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1290
|
-
return cat_features, search_keys_for_metrics
|
|
1291
|
-
|
|
1292
1277
|
def _prepare_data_for_metrics(
|
|
1293
1278
|
self,
|
|
1294
1279
|
trace_id: str,
|
|
@@ -1303,7 +1288,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1303
1288
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1304
1289
|
progress_bar: Optional[ProgressBar] = None,
|
|
1305
1290
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1306
|
-
cat_features: Optional[List[str]] = None,
|
|
1307
1291
|
):
|
|
1308
1292
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1309
1293
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1361,8 +1345,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1361
1345
|
|
|
1362
1346
|
# Detect and drop high cardinality columns in train
|
|
1363
1347
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1364
|
-
|
|
1365
|
-
|
|
1348
|
+
columns_with_high_cardinality = [
|
|
1349
|
+
c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
|
|
1350
|
+
]
|
|
1366
1351
|
if len(columns_with_high_cardinality) > 0:
|
|
1367
1352
|
self.logger.warning(
|
|
1368
1353
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -1824,11 +1809,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1824
1809
|
else:
|
|
1825
1810
|
features_section = ""
|
|
1826
1811
|
|
|
1827
|
-
|
|
1828
|
-
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1812
|
+
api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
|
|
1829
1813
|
-H 'Authorization: {self.api_key}' \\
|
|
1830
1814
|
-H 'Content-Type: application/json' \\
|
|
1831
|
-
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1815
|
+
-d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
|
|
1832
1816
|
return api_example
|
|
1833
1817
|
|
|
1834
1818
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1923,13 +1907,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1923
1907
|
generated_features.extend(converter.generated_features)
|
|
1924
1908
|
else:
|
|
1925
1909
|
self.logger.info("Input dataset hasn't date column")
|
|
1926
|
-
|
|
1927
|
-
|
|
1910
|
+
|
|
1911
|
+
# Don't pass all features to the backend on transform
|
|
1912
|
+
original_features_for_transform = []
|
|
1913
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1914
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1915
|
+
if len(features_not_to_pass) > 0:
|
|
1916
|
+
# Pass only the features that are needed for transform
|
|
1917
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1918
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1919
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1920
|
+
original_features_for_transform = [
|
|
1921
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1922
|
+
]
|
|
1923
|
+
|
|
1924
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1925
|
+
|
|
1926
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1927
|
+
|
|
1928
|
+
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1929
|
+
df[columns_for_system_record_id], index=False
|
|
1930
|
+
).astype("Float64")
|
|
1931
|
+
|
|
1932
|
+
# Explode multiple search keys
|
|
1933
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1934
|
+
|
|
1928
1935
|
email_column = self._get_email_column(search_keys)
|
|
1929
1936
|
hem_column = self._get_hem_column(search_keys)
|
|
1930
1937
|
email_converted_to_hem = False
|
|
1931
1938
|
if email_column:
|
|
1932
|
-
converter = EmailSearchKeyConverter(
|
|
1939
|
+
converter = EmailSearchKeyConverter(
|
|
1940
|
+
email_column, hem_column, search_keys, unnest_search_keys, self.logger
|
|
1941
|
+
)
|
|
1933
1942
|
df = converter.convert(df)
|
|
1934
1943
|
generated_features.extend(converter.generated_features)
|
|
1935
1944
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1943,30 +1952,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1943
1952
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1944
1953
|
|
|
1945
1954
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1946
|
-
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1955
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1956
|
+
for col in original_features_for_transform:
|
|
1957
|
+
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1958
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1947
1959
|
|
|
1948
1960
|
if email_converted_to_hem:
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
# Don't pass features in backend on transform
|
|
1952
|
-
original_features_for_transform = None
|
|
1953
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1954
|
-
if len(non_keys_columns) > 0:
|
|
1955
|
-
# Pass only features that need for transform
|
|
1956
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
-
original_features_for_transform = [
|
|
1960
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
-
]
|
|
1962
|
-
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1961
|
+
features_not_to_pass.append(email_column)
|
|
1963
1962
|
|
|
1964
|
-
|
|
1963
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
1964
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1965
1965
|
|
|
1966
1966
|
if add_fit_system_record_id:
|
|
1967
1967
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1968
1968
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1969
|
-
|
|
1969
|
+
features_not_to_pass.append(SORT_ID)
|
|
1970
1970
|
|
|
1971
1971
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1972
1972
|
|
|
@@ -1974,16 +1974,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1974
1974
|
"Float64"
|
|
1975
1975
|
)
|
|
1976
1976
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1977
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
1978
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
1979
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
1977
1980
|
|
|
1978
1981
|
df = df.reset_index(drop=True)
|
|
1979
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
1982
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
|
|
1980
1983
|
if add_fit_system_record_id:
|
|
1981
1984
|
system_columns_with_original_index.append(SORT_ID)
|
|
1982
1985
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
1983
1986
|
|
|
1984
1987
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
1985
1988
|
|
|
1986
|
-
df_without_features = df.drop(columns=
|
|
1989
|
+
df_without_features = df.drop(columns=features_not_to_pass)
|
|
1987
1990
|
|
|
1988
1991
|
df_without_features = clean_full_duplicates(
|
|
1989
1992
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2139,6 +2142,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2139
2142
|
|
|
2140
2143
|
key_types = search_keys.values()
|
|
2141
2144
|
|
|
2145
|
+
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2146
|
+
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2147
|
+
for multi_key in multi_keys:
|
|
2148
|
+
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2149
|
+
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2150
|
+
self.logger.warning(msg)
|
|
2151
|
+
raise ValidationError(msg)
|
|
2152
|
+
|
|
2142
2153
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2143
2154
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2144
2155
|
self.logger.warning(msg)
|
|
@@ -2154,11 +2165,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2154
2165
|
self.logger.warning(msg)
|
|
2155
2166
|
raise ValidationError(msg)
|
|
2156
2167
|
|
|
2157
|
-
for key_type in SearchKey.__members__.values():
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2168
|
+
# for key_type in SearchKey.__members__.values():
|
|
2169
|
+
# if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2170
|
+
# msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2171
|
+
# self.logger.warning(msg)
|
|
2172
|
+
# raise ValidationError(msg)
|
|
2162
2173
|
|
|
2163
2174
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2164
2175
|
# if (
|
|
@@ -2294,16 +2305,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2294
2305
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2295
2306
|
else:
|
|
2296
2307
|
self.logger.info("Input dataset hasn't date column")
|
|
2297
|
-
|
|
2298
|
-
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2299
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2300
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2301
|
-
email_converted_to_hem = False
|
|
2302
|
-
if email_column:
|
|
2303
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2304
|
-
df = converter.convert(df)
|
|
2305
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2308
|
+
|
|
2307
2309
|
if (
|
|
2308
2310
|
self.detect_missing_search_keys
|
|
2309
2311
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2312,7 +2314,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2312
2314
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2313
2315
|
df = converter.convert(df)
|
|
2314
2316
|
|
|
2317
|
+
# Explode multiple search keys
|
|
2315
2318
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2319
|
+
meaning_types = {
|
|
2320
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2321
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2322
|
+
}
|
|
2323
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2324
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2325
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2326
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2327
|
+
|
|
2328
|
+
# TODO check that this is correct for enrichment
|
|
2329
|
+
self.df_with_original_index = df.copy()
|
|
2330
|
+
|
|
2331
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2332
|
+
|
|
2333
|
+
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2334
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2335
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2336
|
+
email_converted_to_hem = False
|
|
2337
|
+
if email_column:
|
|
2338
|
+
converter = EmailSearchKeyConverter(
|
|
2339
|
+
email_column, hem_column, self.fit_search_keys, unnest_search_keys, self.logger
|
|
2340
|
+
)
|
|
2341
|
+
df = converter.convert(df)
|
|
2342
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2343
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2344
|
+
|
|
2345
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2346
|
+
self.fit_search_keys.keys()
|
|
2347
|
+
)
|
|
2316
2348
|
if email_converted_to_hem:
|
|
2317
2349
|
non_feature_columns.append(email_column)
|
|
2318
2350
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2336,12 +2368,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2336
2368
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2337
2369
|
}
|
|
2338
2370
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2371
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2372
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2373
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2339
2374
|
if eval_set is not None and len(eval_set) > 0:
|
|
2340
2375
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2341
2376
|
|
|
2342
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2377
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2343
2378
|
|
|
2344
|
-
self.df_with_original_index = df.copy()
|
|
2345
2379
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2346
2380
|
|
|
2347
2381
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2349,14 +2383,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2349
2383
|
dataset = Dataset(
|
|
2350
2384
|
"tds_" + str(uuid.uuid4()),
|
|
2351
2385
|
df=df,
|
|
2386
|
+
meaning_types=meaning_types,
|
|
2387
|
+
search_keys=combined_search_keys,
|
|
2388
|
+
unnest_search_keys=unnest_search_keys,
|
|
2352
2389
|
model_task_type=model_task_type,
|
|
2353
2390
|
date_format=self.date_format,
|
|
2354
2391
|
random_state=self.random_state,
|
|
2355
2392
|
rest_client=self.rest_client,
|
|
2356
2393
|
logger=self.logger,
|
|
2357
2394
|
)
|
|
2358
|
-
dataset.meaning_types = meaning_types
|
|
2359
|
-
dataset.search_keys = combined_search_keys
|
|
2360
2395
|
if email_converted_to_hem:
|
|
2361
2396
|
dataset.ignore_columns = [email_column]
|
|
2362
2397
|
|
|
@@ -2876,25 +2911,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2876
2911
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2877
2912
|
return col
|
|
2878
2913
|
|
|
2879
|
-
@staticmethod
|
|
2880
|
-
def _add_current_date_as_key(
|
|
2881
|
-
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
2882
|
-
) -> pd.DataFrame:
|
|
2883
|
-
if (
|
|
2884
|
-
set(search_keys.values()) == {SearchKey.PHONE}
|
|
2885
|
-
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
2886
|
-
or set(search_keys.values()) == {SearchKey.HEM}
|
|
2887
|
-
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
2888
|
-
):
|
|
2889
|
-
msg = bundle.get("current_date_added")
|
|
2890
|
-
print(msg)
|
|
2891
|
-
logger.warning(msg)
|
|
2892
|
-
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2893
|
-
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2894
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
2895
|
-
df = converter.convert(df)
|
|
2896
|
-
return df
|
|
2897
|
-
|
|
2898
2914
|
@staticmethod
|
|
2899
2915
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2900
2916
|
return [
|
|
@@ -2905,15 +2921,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2905
2921
|
|
|
2906
2922
|
@staticmethod
|
|
2907
2923
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2908
|
-
for col, t in search_keys.items()
|
|
2909
|
-
|
|
2910
|
-
|
|
2924
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
|
2925
|
+
if len(cols) > 1:
|
|
2926
|
+
raise Exception("More than one email column found after unnest")
|
|
2927
|
+
if len(cols) == 1:
|
|
2928
|
+
return cols[0]
|
|
2911
2929
|
|
|
2912
2930
|
@staticmethod
|
|
2913
2931
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2914
|
-
for col, t in search_keys.items()
|
|
2915
|
-
|
|
2916
|
-
|
|
2932
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
|
2933
|
+
if len(cols) > 1:
|
|
2934
|
+
raise Exception("More than one hem column found after unnest")
|
|
2935
|
+
if len(cols) == 1:
|
|
2936
|
+
return cols[0]
|
|
2917
2937
|
|
|
2918
2938
|
@staticmethod
|
|
2919
2939
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2921,8 +2941,42 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2921
2941
|
if t == SearchKey.PHONE:
|
|
2922
2942
|
return col
|
|
2923
2943
|
|
|
2944
|
+
def _explode_multiple_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
|
|
2945
|
+
# find groups of multiple search keys
|
|
2946
|
+
search_key_names_by_type: Dict[SearchKey, str] = dict()
|
|
2947
|
+
for key_name, key_type in search_keys.items():
|
|
2948
|
+
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
2949
|
+
search_key_names_by_type = {
|
|
2950
|
+
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
2951
|
+
}
|
|
2952
|
+
if len(search_key_names_by_type) == 0:
|
|
2953
|
+
return df, []
|
|
2954
|
+
|
|
2955
|
+
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
2956
|
+
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
2957
|
+
exploded_dfs = []
|
|
2958
|
+
unnest_search_keys = []
|
|
2959
|
+
|
|
2960
|
+
for key_type, key_names in search_key_names_by_type.items():
|
|
2961
|
+
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
2962
|
+
exploded_df = pd.melt(
|
|
2963
|
+
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
2964
|
+
)
|
|
2965
|
+
exploded_dfs.append(exploded_df)
|
|
2966
|
+
for old_key in key_names:
|
|
2967
|
+
del search_keys[old_key]
|
|
2968
|
+
search_keys[new_search_key] = key_type
|
|
2969
|
+
unnest_search_keys.append(new_search_key)
|
|
2970
|
+
|
|
2971
|
+
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
2972
|
+
return df, unnest_search_keys
|
|
2973
|
+
|
|
2924
2974
|
def __add_fit_system_record_id(
|
|
2925
|
-
self,
|
|
2975
|
+
self,
|
|
2976
|
+
df: pd.DataFrame,
|
|
2977
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
2978
|
+
search_keys: Dict[str, SearchKey],
|
|
2979
|
+
id_name: str,
|
|
2926
2980
|
) -> pd.DataFrame:
|
|
2927
2981
|
# save original order of rows
|
|
2928
2982
|
original_index_name = df.index.name
|
|
@@ -2971,19 +3025,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2971
3025
|
|
|
2972
3026
|
df = df.reset_index(drop=True).reset_index()
|
|
2973
3027
|
# system_record_id saves correct order for fit
|
|
2974
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
3028
|
+
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
2975
3029
|
|
|
2976
3030
|
# return original order
|
|
2977
3031
|
df = df.set_index(ORIGINAL_INDEX)
|
|
2978
3032
|
df.index.name = original_index_name
|
|
2979
3033
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
2980
3034
|
|
|
2981
|
-
meaning_types[
|
|
3035
|
+
meaning_types[id_name] = (
|
|
3036
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3037
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3038
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3039
|
+
)
|
|
2982
3040
|
return df
|
|
2983
3041
|
|
|
2984
3042
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2985
3043
|
target = df[self.TARGET_NAME]
|
|
2986
|
-
if is_string_dtype(target)
|
|
3044
|
+
if is_string_dtype(target):
|
|
2987
3045
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
2988
3046
|
# If less than 5% of the values are non-numeric, leave these rows with a NaN target; they will be dropped later
|
|
2989
3047
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3033,7 +3091,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3033
3091
|
)
|
|
3034
3092
|
|
|
3035
3093
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3036
|
-
dup_features = [
|
|
3094
|
+
dup_features = [
|
|
3095
|
+
c for c in comparing_columns
|
|
3096
|
+
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3097
|
+
]
|
|
3037
3098
|
if len(dup_features) > 0:
|
|
3038
3099
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3039
3100
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3044,8 +3105,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3044
3105
|
result_features = pd.merge(
|
|
3045
3106
|
df_with_original_index,
|
|
3046
3107
|
result_features,
|
|
3047
|
-
|
|
3048
|
-
right_on=SYSTEM_RECORD_ID,
|
|
3108
|
+
on=ENTITY_SYSTEM_RECORD_ID,
|
|
3049
3109
|
how="left" if is_transform else "inner",
|
|
3050
3110
|
)
|
|
3051
3111
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3383,8 +3443,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3383
3443
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3384
3444
|
else:
|
|
3385
3445
|
if x[column_name].isnull().all() or (
|
|
3386
|
-
|
|
3387
|
-
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3446
|
+
is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
|
|
3388
3447
|
):
|
|
3389
3448
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3390
3449
|
|
|
@@ -3426,13 +3485,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3426
3485
|
self.warning_counter.increment()
|
|
3427
3486
|
|
|
3428
3487
|
if len(valid_search_keys) == 1:
|
|
3429
|
-
|
|
3430
|
-
|
|
3431
|
-
|
|
3432
|
-
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3488
|
+
key, value = list(valid_search_keys.items())[0]
|
|
3489
|
+
# Show a warning when the only search key is constant (has a single unique value)
|
|
3490
|
+
if x[key].nunique() == 1:
|
|
3491
|
+
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3492
|
+
print(msg)
|
|
3493
|
+
self.logger.warning(msg)
|
|
3494
|
+
self.warning_counter.increment()
|
|
3436
3495
|
|
|
3437
3496
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3438
3497
|
|
|
@@ -3542,61 +3601,68 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3542
3601
|
def check_need_detect(search_key: SearchKey):
|
|
3543
3602
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3544
3603
|
|
|
3545
|
-
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3546
|
-
|
|
3547
|
-
|
|
3548
|
-
|
|
3549
|
-
|
|
3550
|
-
|
|
3604
|
+
# if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3605
|
+
if check_need_detect(SearchKey.POSTAL_CODE):
|
|
3606
|
+
maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3607
|
+
if maybe_keys:
|
|
3608
|
+
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
3609
|
+
search_keys.update(new_keys)
|
|
3610
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3611
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3551
3612
|
if not silent_mode:
|
|
3552
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3613
|
+
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
3553
3614
|
|
|
3554
3615
|
if (
|
|
3555
3616
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3556
3617
|
and self.country_code is None
|
|
3557
3618
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3558
3619
|
):
|
|
3559
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3560
|
-
if maybe_key
|
|
3561
|
-
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3562
|
-
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3620
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3621
|
+
if maybe_key:
|
|
3622
|
+
search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3623
|
+
self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3563
3624
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3564
3625
|
if not silent_mode:
|
|
3565
3626
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3566
3627
|
|
|
3567
3628
|
if (
|
|
3568
|
-
SearchKey.EMAIL not in search_keys.values()
|
|
3569
|
-
|
|
3629
|
+
# SearchKey.EMAIL not in search_keys.values()
|
|
3630
|
+
SearchKey.HEM not in search_keys.values()
|
|
3570
3631
|
and check_need_detect(SearchKey.HEM)
|
|
3571
3632
|
):
|
|
3572
|
-
|
|
3573
|
-
if
|
|
3633
|
+
maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3634
|
+
if maybe_keys:
|
|
3574
3635
|
if self.__is_registered or is_demo_dataset:
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
self.
|
|
3636
|
+
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
3637
|
+
search_keys.update(new_keys)
|
|
3638
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3639
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3578
3640
|
if not silent_mode:
|
|
3579
|
-
print(self.bundle.get("email_detected").format(
|
|
3641
|
+
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
3580
3642
|
else:
|
|
3581
3643
|
self.logger.warning(
|
|
3582
|
-
f"Autodetected search key EMAIL in column {
|
|
3644
|
+
f"Autodetected search key EMAIL in column {maybe_keys}."
|
|
3645
|
+
" But not used because not registered user"
|
|
3583
3646
|
)
|
|
3584
3647
|
if not silent_mode:
|
|
3585
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3648
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3586
3649
|
self.warning_counter.increment()
|
|
3587
3650
|
|
|
3588
|
-
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3589
|
-
|
|
3590
|
-
|
|
3651
|
+
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3652
|
+
if check_need_detect(SearchKey.PHONE):
|
|
3653
|
+
maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3654
|
+
if maybe_keys:
|
|
3591
3655
|
if self.__is_registered or is_demo_dataset:
|
|
3592
|
-
|
|
3593
|
-
|
|
3594
|
-
self.
|
|
3656
|
+
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
3657
|
+
search_keys.update(new_keys)
|
|
3658
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3659
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3595
3660
|
if not silent_mode:
|
|
3596
|
-
print(self.bundle.get("phone_detected").format(
|
|
3661
|
+
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
3597
3662
|
else:
|
|
3598
3663
|
self.logger.warning(
|
|
3599
|
-
f"Autodetected search key PHONE in column {
|
|
3664
|
+
f"Autodetected search key PHONE in column {maybe_keys}. "
|
|
3665
|
+
"But not used because not registered user"
|
|
3600
3666
|
)
|
|
3601
3667
|
if not silent_mode:
|
|
3602
3668
|
print(self.bundle.get("phone_detected_not_registered"))
|