upgini 1.1.275a1__py3-none-any.whl → 1.1.276__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/ads.py +6 -2
- upgini/autofe/date.py +9 -2
- upgini/data_source/data_source_publisher.py +1 -1
- upgini/dataset.py +6 -13
- upgini/features_enricher.py +156 -220
- upgini/metadata.py +1 -9
- upgini/metrics.py +12 -0
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/__init__.py +3 -2
- upgini/utils/base_search_key_detector.py +12 -14
- upgini/utils/country_utils.py +2 -2
- upgini/utils/datetime_utils.py +7 -4
- upgini/utils/deduplicate_utils.py +1 -11
- upgini/utils/email_utils.py +2 -7
- upgini/utils/features_validator.py +2 -1
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +25 -13
- {upgini-1.1.275a1.dist-info → upgini-1.1.276.dist-info}/METADATA +2 -2
- {upgini-1.1.275a1.dist-info → upgini-1.1.276.dist-info}/RECORD +23 -23
- {upgini-1.1.275a1.dist-info → upgini-1.1.276.dist-info}/LICENSE +0 -0
- {upgini-1.1.275a1.dist-info → upgini-1.1.276.dist-info}/WHEEL +0 -0
- {upgini-1.1.275a1.dist-info → upgini-1.1.276.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import datetime
|
|
2
3
|
import gc
|
|
3
4
|
import hashlib
|
|
4
5
|
import itertools
|
|
@@ -10,7 +11,6 @@ import sys
|
|
|
10
11
|
import tempfile
|
|
11
12
|
import time
|
|
12
13
|
import uuid
|
|
13
|
-
from collections import Counter
|
|
14
14
|
from dataclasses import dataclass
|
|
15
15
|
from threading import Thread
|
|
16
16
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -21,6 +21,7 @@ from pandas.api.types import (
|
|
|
21
21
|
is_bool,
|
|
22
22
|
is_datetime64_any_dtype,
|
|
23
23
|
is_numeric_dtype,
|
|
24
|
+
is_object_dtype,
|
|
24
25
|
is_period_dtype,
|
|
25
26
|
is_string_dtype,
|
|
26
27
|
)
|
|
@@ -44,11 +45,9 @@ from upgini.mdc import MDC
|
|
|
44
45
|
from upgini.metadata import (
|
|
45
46
|
COUNTRY,
|
|
46
47
|
DEFAULT_INDEX,
|
|
47
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
48
48
|
EVAL_SET_INDEX,
|
|
49
49
|
ORIGINAL_INDEX,
|
|
50
50
|
RENAMED_INDEX,
|
|
51
|
-
SEARCH_KEY_UNNEST,
|
|
52
51
|
SORT_ID,
|
|
53
52
|
SYSTEM_RECORD_ID,
|
|
54
53
|
TARGET,
|
|
@@ -149,6 +148,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
149
148
|
"""
|
|
150
149
|
|
|
151
150
|
TARGET_NAME = "target"
|
|
151
|
+
CURRENT_DATE = "current_date"
|
|
152
152
|
RANDOM_STATE = 42
|
|
153
153
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
154
154
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -210,6 +210,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
210
210
|
client_ip: Optional[str] = None,
|
|
211
211
|
client_visitorid: Optional[str] = None,
|
|
212
212
|
custom_bundle_config: Optional[str] = None,
|
|
213
|
+
add_date_if_missing: bool = True,
|
|
213
214
|
**kwargs,
|
|
214
215
|
):
|
|
215
216
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -320,6 +321,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
320
321
|
self.raise_validation_error = raise_validation_error
|
|
321
322
|
self.exclude_columns = exclude_columns
|
|
322
323
|
self.baseline_score_column = baseline_score_column
|
|
324
|
+
self.add_date_if_missing = add_date_if_missing
|
|
323
325
|
|
|
324
326
|
def _get_api_key(self):
|
|
325
327
|
return self._api_key
|
|
@@ -423,6 +425,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
423
425
|
|
|
424
426
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
425
427
|
|
|
428
|
+
# Validate client estimator params
|
|
429
|
+
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
430
|
+
|
|
426
431
|
try:
|
|
427
432
|
self.X = X
|
|
428
433
|
self.y = y
|
|
@@ -816,6 +821,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
816
821
|
trace_id = trace_id or str(uuid.uuid4())
|
|
817
822
|
start_time = time.time()
|
|
818
823
|
with MDC(trace_id=trace_id):
|
|
824
|
+
self.logger.info("Start calculate metrics")
|
|
819
825
|
if len(args) > 0:
|
|
820
826
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
821
827
|
self.logger.warning(msg)
|
|
@@ -867,22 +873,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
867
873
|
self.__display_support_link(msg)
|
|
868
874
|
return None
|
|
869
875
|
|
|
870
|
-
cat_features =
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
estimator is not None
|
|
874
|
-
and hasattr(estimator, "get_param")
|
|
875
|
-
and estimator.get_param("cat_features") is not None
|
|
876
|
-
):
|
|
877
|
-
cat_features = estimator.get_param("cat_features")
|
|
878
|
-
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
879
|
-
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
880
|
-
for cat_feature in cat_features:
|
|
881
|
-
if cat_feature in self.search_keys:
|
|
882
|
-
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
883
|
-
search_keys_for_metrics.append(cat_feature)
|
|
884
|
-
else:
|
|
885
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
876
|
+
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
877
|
+
estimator, effective_X, self.search_keys
|
|
878
|
+
)
|
|
886
879
|
|
|
887
880
|
prepared_data = self._prepare_data_for_metrics(
|
|
888
881
|
trace_id=trace_id,
|
|
@@ -897,6 +890,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
897
890
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
898
891
|
progress_bar=progress_bar,
|
|
899
892
|
progress_callback=progress_callback,
|
|
893
|
+
cat_features=cat_features,
|
|
900
894
|
)
|
|
901
895
|
if prepared_data is None:
|
|
902
896
|
return None
|
|
@@ -1184,8 +1178,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1184
1178
|
search_keys = self.search_keys.copy()
|
|
1185
1179
|
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1186
1180
|
|
|
1187
|
-
unnest_search_keys = []
|
|
1188
|
-
|
|
1189
1181
|
extended_X = x.copy()
|
|
1190
1182
|
generated_features = []
|
|
1191
1183
|
date_column = self._get_date_column(search_keys)
|
|
@@ -1196,7 +1188,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1196
1188
|
email_column = self._get_email_column(search_keys)
|
|
1197
1189
|
hem_column = self._get_hem_column(search_keys)
|
|
1198
1190
|
if email_column:
|
|
1199
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys,
|
|
1191
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1200
1192
|
extended_X = converter.convert(extended_X)
|
|
1201
1193
|
generated_features.extend(converter.generated_features)
|
|
1202
1194
|
if (
|
|
@@ -1274,6 +1266,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1274
1266
|
|
|
1275
1267
|
return _cv, groups
|
|
1276
1268
|
|
|
1269
|
+
def _get_client_cat_features(
    self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
) -> Tuple[Optional[List[str]], List[str]]:
    """Collect the categorical features declared on the user's estimator.

    The estimator is inspected only when it is not ``None``, exposes a ``get_param``
    attribute and ``get_param("cat_features")`` is not ``None``. When every declared
    feature is an ``int`` the values are treated as positions and translated to the
    corresponding names in ``X.columns``.

    Args:
        estimator: Optional user estimator to read ``cat_features`` from.
        X: Frame whose ``columns`` are used to resolve positional feature indices.
        search_keys: Mapping of column name to its search key type.

    Returns:
        A ``(cat_features, search_keys_for_metrics)`` pair. ``cat_features`` is ``None``
        when nothing could be read from the estimator, otherwise the declared features
        (as column names when they were given as positions). ``search_keys_for_metrics``
        lists the categorical features that are also COUNTRY or POSTAL_CODE search keys.

    Raises:
        ValidationError: If a categorical feature is a search key of any type other
            than COUNTRY or POSTAL_CODE.
    """
    search_keys_for_metrics: List[str] = []

    if estimator is None or not hasattr(estimator, "get_param"):
        return None, search_keys_for_metrics

    # Read the parameter once and reuse the value for both the None check and processing.
    cat_features = estimator.get_param("cat_features")
    if cat_features is None:
        return None, search_keys_for_metrics

    if len(cat_features) > 0:
        # Positional indices are converted to column names of X.
        if all(isinstance(f, int) for f in cat_features):
            cat_features = [X.columns[i] for i in cat_features]
        self.logger.info(f"Collected categorical features {cat_features} from user estimator")

        for cat_feature in cat_features:
            if cat_feature not in search_keys:
                continue
            # Only COUNTRY and POSTAL_CODE search keys may double as categorical features.
            if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
                search_keys_for_metrics.append(cat_feature)
            else:
                raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))

    return cat_features, search_keys_for_metrics
|
|
1291
|
+
|
|
1277
1292
|
def _prepare_data_for_metrics(
|
|
1278
1293
|
self,
|
|
1279
1294
|
trace_id: str,
|
|
@@ -1288,6 +1303,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1288
1303
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1289
1304
|
progress_bar: Optional[ProgressBar] = None,
|
|
1290
1305
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1306
|
+
cat_features: Optional[List[str]] = None,
|
|
1291
1307
|
):
|
|
1292
1308
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1293
1309
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1345,9 +1361,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1345
1361
|
|
|
1346
1362
|
# Detect and drop high cardinality columns in train
|
|
1347
1363
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
]
|
|
1364
|
+
non_excluding_columns = (self.generate_features or []) + (cat_features or [])
|
|
1365
|
+
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
|
1351
1366
|
if len(columns_with_high_cardinality) > 0:
|
|
1352
1367
|
self.logger.warning(
|
|
1353
1368
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -1809,10 +1824,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1809
1824
|
else:
|
|
1810
1825
|
features_section = ""
|
|
1811
1826
|
|
|
1812
|
-
|
|
1827
|
+
search_id = self._search_task.search_task_id
|
|
1828
|
+
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1813
1829
|
-H 'Authorization: {self.api_key}' \\
|
|
1814
1830
|
-H 'Content-Type: application/json' \\
|
|
1815
|
-
-d '{{"
|
|
1831
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1816
1832
|
return api_example
|
|
1817
1833
|
|
|
1818
1834
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1907,38 +1923,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1907
1923
|
generated_features.extend(converter.generated_features)
|
|
1908
1924
|
else:
|
|
1909
1925
|
self.logger.info("Input dataset hasn't date column")
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
original_features_for_transform = []
|
|
1913
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1914
|
-
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1915
|
-
if len(features_not_to_pass) > 0:
|
|
1916
|
-
# Pass only features that need for transform
|
|
1917
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1918
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1919
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1920
|
-
original_features_for_transform = [
|
|
1921
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1922
|
-
]
|
|
1923
|
-
|
|
1924
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1925
|
-
|
|
1926
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1927
|
-
|
|
1928
|
-
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1929
|
-
df[columns_for_system_record_id], index=False
|
|
1930
|
-
).astype("Float64")
|
|
1931
|
-
|
|
1932
|
-
# Explode multiple search keys
|
|
1933
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1934
|
-
|
|
1926
|
+
if self.add_date_if_missing:
|
|
1927
|
+
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1935
1928
|
email_column = self._get_email_column(search_keys)
|
|
1936
1929
|
hem_column = self._get_hem_column(search_keys)
|
|
1937
1930
|
email_converted_to_hem = False
|
|
1938
1931
|
if email_column:
|
|
1939
|
-
converter = EmailSearchKeyConverter(
|
|
1940
|
-
email_column, hem_column, search_keys, unnest_search_keys, self.logger
|
|
1941
|
-
)
|
|
1932
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1942
1933
|
df = converter.convert(df)
|
|
1943
1934
|
generated_features.extend(converter.generated_features)
|
|
1944
1935
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1952,21 +1943,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1952
1943
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1953
1944
|
|
|
1954
1945
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1955
|
-
|
|
1956
|
-
for col in original_features_for_transform:
|
|
1957
|
-
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1958
|
-
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1946
|
+
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1959
1947
|
|
|
1960
1948
|
if email_converted_to_hem:
|
|
1961
|
-
|
|
1949
|
+
non_keys_columns.append(email_column)
|
|
1950
|
+
|
|
1951
|
+
# Don't pass features in backend on transform
|
|
1952
|
+
original_features_for_transform = None
|
|
1953
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1954
|
+
if len(non_keys_columns) > 0:
|
|
1955
|
+
# Pass only features that need for transform
|
|
1956
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
+
original_features_for_transform = [
|
|
1960
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
+
]
|
|
1962
|
+
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1962
1963
|
|
|
1963
|
-
|
|
1964
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1964
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1965
1965
|
|
|
1966
1966
|
if add_fit_system_record_id:
|
|
1967
1967
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1968
1968
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1969
|
-
|
|
1969
|
+
non_keys_columns.append(SORT_ID)
|
|
1970
1970
|
|
|
1971
1971
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1972
1972
|
|
|
@@ -1974,19 +1974,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1974
1974
|
"Float64"
|
|
1975
1975
|
)
|
|
1976
1976
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1977
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
1978
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
1979
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
1980
1977
|
|
|
1981
1978
|
df = df.reset_index(drop=True)
|
|
1982
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID
|
|
1979
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
1983
1980
|
if add_fit_system_record_id:
|
|
1984
1981
|
system_columns_with_original_index.append(SORT_ID)
|
|
1985
1982
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
1986
1983
|
|
|
1987
1984
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
1988
1985
|
|
|
1989
|
-
df_without_features = df.drop(columns=
|
|
1986
|
+
df_without_features = df.drop(columns=non_keys_columns)
|
|
1990
1987
|
|
|
1991
1988
|
df_without_features = clean_full_duplicates(
|
|
1992
1989
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2142,14 +2139,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2142
2139
|
|
|
2143
2140
|
key_types = search_keys.values()
|
|
2144
2141
|
|
|
2145
|
-
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2146
|
-
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2147
|
-
for multi_key in multi_keys:
|
|
2148
|
-
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2149
|
-
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2150
|
-
self.logger.warning(msg)
|
|
2151
|
-
raise ValidationError(msg)
|
|
2152
|
-
|
|
2153
2142
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2154
2143
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2155
2144
|
self.logger.warning(msg)
|
|
@@ -2165,11 +2154,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2165
2154
|
self.logger.warning(msg)
|
|
2166
2155
|
raise ValidationError(msg)
|
|
2167
2156
|
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2157
|
+
for key_type in SearchKey.__members__.values():
|
|
2158
|
+
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2159
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2160
|
+
self.logger.warning(msg)
|
|
2161
|
+
raise ValidationError(msg)
|
|
2173
2162
|
|
|
2174
2163
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2175
2164
|
# if (
|
|
@@ -2305,7 +2294,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2305
2294
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
2295
|
else:
|
|
2307
2296
|
self.logger.info("Input dataset hasn't date column")
|
|
2308
|
-
|
|
2297
|
+
if self.add_date_if_missing:
|
|
2298
|
+
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2299
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2300
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2301
|
+
email_converted_to_hem = False
|
|
2302
|
+
if email_column:
|
|
2303
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2304
|
+
df = converter.convert(df)
|
|
2305
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2309
2307
|
if (
|
|
2310
2308
|
self.detect_missing_search_keys
|
|
2311
2309
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2314,37 +2312,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2314
2312
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2315
2313
|
df = converter.convert(df)
|
|
2316
2314
|
|
|
2317
|
-
# Explode multiple search keys
|
|
2318
2315
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2319
|
-
meaning_types = {
|
|
2320
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2321
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2322
|
-
}
|
|
2323
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2324
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2325
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2326
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2327
|
-
|
|
2328
|
-
# TODO check that this is correct for enrichment
|
|
2329
|
-
self.df_with_original_index = df.copy()
|
|
2330
|
-
|
|
2331
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2332
|
-
|
|
2333
|
-
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2334
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2335
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2336
|
-
email_converted_to_hem = False
|
|
2337
|
-
if email_column:
|
|
2338
|
-
converter = EmailSearchKeyConverter(
|
|
2339
|
-
email_column, hem_column, self.fit_search_keys, unnest_search_keys, self.logger
|
|
2340
|
-
)
|
|
2341
|
-
df = converter.convert(df)
|
|
2342
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2343
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2344
|
-
|
|
2345
|
-
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2346
|
-
self.fit_search_keys.keys()
|
|
2347
|
-
)
|
|
2348
2316
|
if email_converted_to_hem:
|
|
2349
2317
|
non_feature_columns.append(email_column)
|
|
2350
2318
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2368,14 +2336,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2368
2336
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2369
2337
|
}
|
|
2370
2338
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2371
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2372
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2373
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2374
2339
|
if eval_set is not None and len(eval_set) > 0:
|
|
2375
2340
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2376
2341
|
|
|
2377
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys
|
|
2342
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2378
2343
|
|
|
2344
|
+
self.df_with_original_index = df.copy()
|
|
2379
2345
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2380
2346
|
|
|
2381
2347
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2383,15 +2349,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2383
2349
|
dataset = Dataset(
|
|
2384
2350
|
"tds_" + str(uuid.uuid4()),
|
|
2385
2351
|
df=df,
|
|
2386
|
-
meaning_types=meaning_types,
|
|
2387
|
-
search_keys=combined_search_keys,
|
|
2388
|
-
unnest_search_keys=unnest_search_keys,
|
|
2389
2352
|
model_task_type=model_task_type,
|
|
2390
2353
|
date_format=self.date_format,
|
|
2391
2354
|
random_state=self.random_state,
|
|
2392
2355
|
rest_client=self.rest_client,
|
|
2393
2356
|
logger=self.logger,
|
|
2394
2357
|
)
|
|
2358
|
+
dataset.meaning_types = meaning_types
|
|
2359
|
+
dataset.search_keys = combined_search_keys
|
|
2395
2360
|
if email_converted_to_hem:
|
|
2396
2361
|
dataset.ignore_columns = [email_column]
|
|
2397
2362
|
|
|
@@ -2911,6 +2876,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2911
2876
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2912
2877
|
return col
|
|
2913
2878
|
|
|
2879
|
+
@staticmethod
def _add_current_date_as_key(
    df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
) -> pd.DataFrame:
    """Add today's date as a DATE search key for specific search key combinations.

    The date is added only when the set of search key types is exactly one of:
    {PHONE}, {EMAIL}, {HEM} or {COUNTRY, POSTAL_CODE}. In that case the
    ``current_date_added`` bundle message is printed and logged as a warning, a
    ``FeaturesEnricher.CURRENT_DATE`` column holding ``datetime.date.today()`` is
    written into ``df``, the same name is registered in ``search_keys`` as
    ``SearchKey.DATE`` (the passed dict is mutated), and the frame is passed through
    ``DateTimeSearchKeyConverter.convert``.

    Args:
        df: Frame to extend; the new column is assigned on this object.
        search_keys: Mapping of column name to search key type; updated in place.
        logger: Logger used for the warning and handed to the converter.
        bundle: Resource bundle providing the message and handed to the converter.

    Returns:
        The converted frame when the date was added, otherwise ``df`` unchanged.
    """
    present_key_types = set(search_keys.values())
    dateless_combinations = (
        {SearchKey.PHONE},
        {SearchKey.EMAIL},
        {SearchKey.HEM},
        {SearchKey.COUNTRY, SearchKey.POSTAL_CODE},
    )
    if not any(present_key_types == combination for combination in dateless_combinations):
        return df

    notice = bundle.get("current_date_added")
    print(notice)
    logger.warning(notice)

    date_column = FeaturesEnricher.CURRENT_DATE
    df[date_column] = datetime.date.today()
    search_keys[date_column] = SearchKey.DATE
    return DateTimeSearchKeyConverter(date_column, None, logger, bundle).convert(df)
|
|
2897
|
+
|
|
2914
2898
|
@staticmethod
|
|
2915
2899
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2916
2900
|
return [
|
|
@@ -2921,19 +2905,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2921
2905
|
|
|
2922
2906
|
@staticmethod
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
    """Return the first column registered as an EMAIL search key, or ``None`` if there is none."""
    return next(
        (column for column, key_type in search_keys.items() if key_type == SearchKey.EMAIL),
        None,
    )
|
|
2929
2911
|
|
|
2930
2912
|
@staticmethod
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
    """Return the first column registered as a HEM search key, or ``None`` if there is none."""
    return next(
        (column for column, key_type in search_keys.items() if key_type == SearchKey.HEM),
        None,
    )
|
|
2937
2917
|
|
|
2938
2918
|
@staticmethod
|
|
2939
2919
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2941,42 +2921,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2941
2921
|
if t == SearchKey.PHONE:
|
|
2942
2922
|
return col
|
|
2943
2923
|
|
|
2944
|
-
def _explode_multiple_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
|
|
2945
|
-
# find groups of multiple search keys
|
|
2946
|
-
search_key_names_by_type: Dict[SearchKey, str] = dict()
|
|
2947
|
-
for key_name, key_type in search_keys.items():
|
|
2948
|
-
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
2949
|
-
search_key_names_by_type = {
|
|
2950
|
-
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
2951
|
-
}
|
|
2952
|
-
if len(search_key_names_by_type) == 0:
|
|
2953
|
-
return df, []
|
|
2954
|
-
|
|
2955
|
-
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
2956
|
-
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
2957
|
-
exploded_dfs = []
|
|
2958
|
-
unnest_search_keys = []
|
|
2959
|
-
|
|
2960
|
-
for key_type, key_names in search_key_names_by_type.items():
|
|
2961
|
-
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
2962
|
-
exploded_df = pd.melt(
|
|
2963
|
-
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
2964
|
-
)
|
|
2965
|
-
exploded_dfs.append(exploded_df)
|
|
2966
|
-
for old_key in key_names:
|
|
2967
|
-
del search_keys[old_key]
|
|
2968
|
-
search_keys[new_search_key] = key_type
|
|
2969
|
-
unnest_search_keys.append(new_search_key)
|
|
2970
|
-
|
|
2971
|
-
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
2972
|
-
return df, unnest_search_keys
|
|
2973
|
-
|
|
2974
2924
|
def __add_fit_system_record_id(
|
|
2975
|
-
self,
|
|
2976
|
-
df: pd.DataFrame,
|
|
2977
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
2978
|
-
search_keys: Dict[str, SearchKey],
|
|
2979
|
-
id_name: str,
|
|
2925
|
+
self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
|
|
2980
2926
|
) -> pd.DataFrame:
|
|
2981
2927
|
# save original order or rows
|
|
2982
2928
|
original_index_name = df.index.name
|
|
@@ -3025,23 +2971,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3025
2971
|
|
|
3026
2972
|
df = df.reset_index(drop=True).reset_index()
|
|
3027
2973
|
# system_record_id saves correct order for fit
|
|
3028
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
2974
|
+
df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
|
|
3029
2975
|
|
|
3030
2976
|
# return original order
|
|
3031
2977
|
df = df.set_index(ORIGINAL_INDEX)
|
|
3032
2978
|
df.index.name = original_index_name
|
|
3033
2979
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3034
2980
|
|
|
3035
|
-
meaning_types[
|
|
3036
|
-
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3037
|
-
if id_name == SYSTEM_RECORD_ID
|
|
3038
|
-
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3039
|
-
)
|
|
2981
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3040
2982
|
return df
|
|
3041
2983
|
|
|
3042
2984
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
3043
2985
|
target = df[self.TARGET_NAME]
|
|
3044
|
-
if is_string_dtype(target):
|
|
2986
|
+
if is_string_dtype(target) or is_object_dtype(target):
|
|
3045
2987
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
3046
2988
|
# If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
|
|
3047
2989
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3091,10 +3033,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3091
3033
|
)
|
|
3092
3034
|
|
|
3093
3035
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3094
|
-
dup_features = [
|
|
3095
|
-
c for c in comparing_columns
|
|
3096
|
-
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3097
|
-
]
|
|
3036
|
+
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
3098
3037
|
if len(dup_features) > 0:
|
|
3099
3038
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3100
3039
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3105,7 +3044,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3105
3044
|
result_features = pd.merge(
|
|
3106
3045
|
df_with_original_index,
|
|
3107
3046
|
result_features,
|
|
3108
|
-
|
|
3047
|
+
left_on=SYSTEM_RECORD_ID,
|
|
3048
|
+
right_on=SYSTEM_RECORD_ID,
|
|
3109
3049
|
how="left" if is_transform else "inner",
|
|
3110
3050
|
)
|
|
3111
3051
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3316,6 +3256,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3316
3256
|
descriptions = []
|
|
3317
3257
|
for m in autofe_meta:
|
|
3318
3258
|
autofe_feature = Feature.from_formula(m.formula)
|
|
3259
|
+
orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
|
|
3260
|
+
autofe_feature.rename_columns(orig_to_hashed)
|
|
3319
3261
|
autofe_feature.set_display_index(m.display_index)
|
|
3320
3262
|
if autofe_feature.op.is_vector:
|
|
3321
3263
|
continue
|
|
@@ -3443,7 +3385,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3443
3385
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3444
3386
|
else:
|
|
3445
3387
|
if x[column_name].isnull().all() or (
|
|
3446
|
-
is_string_dtype(x[column_name])
|
|
3388
|
+
(is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
|
|
3389
|
+
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3447
3390
|
):
|
|
3448
3391
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3449
3392
|
|
|
@@ -3485,13 +3428,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3485
3428
|
self.warning_counter.increment()
|
|
3486
3429
|
|
|
3487
3430
|
if len(valid_search_keys) == 1:
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3431
|
+
for k, v in valid_search_keys.items():
|
|
3432
|
+
# Show warning for country only if country is the only key
|
|
3433
|
+
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3434
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3435
|
+
print(msg)
|
|
3436
|
+
self.logger.warning(msg)
|
|
3437
|
+
self.warning_counter.increment()
|
|
3495
3438
|
|
|
3496
3439
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3497
3440
|
|
|
@@ -3601,68 +3544,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3601
3544
|
def check_need_detect(search_key: SearchKey):
|
|
3602
3545
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3603
3546
|
|
|
3604
|
-
|
|
3605
|
-
|
|
3606
|
-
|
|
3607
|
-
|
|
3608
|
-
|
|
3609
|
-
|
|
3610
|
-
self.autodetected_search_keys.update(new_keys)
|
|
3611
|
-
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3547
|
+
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3548
|
+
maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
|
|
3549
|
+
if maybe_key is not None:
|
|
3550
|
+
search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3551
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3552
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3612
3553
|
if not silent_mode:
|
|
3613
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3554
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3614
3555
|
|
|
3615
3556
|
if (
|
|
3616
3557
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3617
3558
|
and self.country_code is None
|
|
3618
3559
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3619
3560
|
):
|
|
3620
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3621
|
-
if maybe_key:
|
|
3622
|
-
search_keys[maybe_key
|
|
3623
|
-
self.autodetected_search_keys[maybe_key
|
|
3561
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
|
|
3562
|
+
if maybe_key is not None:
|
|
3563
|
+
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3564
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3624
3565
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3625
3566
|
if not silent_mode:
|
|
3626
3567
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3627
3568
|
|
|
3628
3569
|
if (
|
|
3629
|
-
|
|
3630
|
-
SearchKey.HEM not in search_keys.values()
|
|
3570
|
+
SearchKey.EMAIL not in search_keys.values()
|
|
3571
|
+
and SearchKey.HEM not in search_keys.values()
|
|
3631
3572
|
and check_need_detect(SearchKey.HEM)
|
|
3632
3573
|
):
|
|
3633
|
-
|
|
3634
|
-
if
|
|
3574
|
+
maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
|
|
3575
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3635
3576
|
if self.__is_registered or is_demo_dataset:
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
self.
|
|
3639
|
-
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3577
|
+
search_keys[maybe_key] = SearchKey.EMAIL
|
|
3578
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3579
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3640
3580
|
if not silent_mode:
|
|
3641
|
-
print(self.bundle.get("email_detected").format(
|
|
3581
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3642
3582
|
else:
|
|
3643
3583
|
self.logger.warning(
|
|
3644
|
-
f"Autodetected search key EMAIL in column {
|
|
3645
|
-
" But not used because not registered user"
|
|
3584
|
+
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3646
3585
|
)
|
|
3647
3586
|
if not silent_mode:
|
|
3648
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3587
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3649
3588
|
self.warning_counter.increment()
|
|
3650
3589
|
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
if maybe_keys:
|
|
3590
|
+
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3591
|
+
maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
|
|
3592
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3655
3593
|
if self.__is_registered or is_demo_dataset:
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
self.
|
|
3659
|
-
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3594
|
+
search_keys[maybe_key] = SearchKey.PHONE
|
|
3595
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3596
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3660
3597
|
if not silent_mode:
|
|
3661
|
-
print(self.bundle.get("phone_detected").format(
|
|
3598
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3662
3599
|
else:
|
|
3663
3600
|
self.logger.warning(
|
|
3664
|
-
f"Autodetected search key PHONE in column {
|
|
3665
|
-
"But not used because not registered user"
|
|
3601
|
+
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3666
3602
|
)
|
|
3667
3603
|
if not silent_mode:
|
|
3668
3604
|
print(self.bundle.get("phone_detected_not_registered"))
|