upgini 1.1.298__tar.gz → 1.1.299__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.1.298 → upgini-1.1.299}/PKG-INFO +1 -1
- upgini-1.1.299/src/upgini/__about__.py +1 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/dataset.py +11 -2
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/features_enricher.py +222 -101
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/metadata.py +10 -2
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/metrics.py +1 -1
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/resource_bundle/strings.properties +1 -0
- upgini-1.1.299/src/upgini/utils/base_search_key_detector.py +27 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/deduplicate_utils.py +11 -1
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/email_utils.py +5 -0
- upgini-1.1.298/src/upgini/__about__.py +0 -1
- upgini-1.1.298/src/upgini/utils/base_search_key_detector.py +0 -25
- {upgini-1.1.298 → upgini-1.1.299}/.gitignore +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/LICENSE +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/README.md +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/pyproject.toml +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/ads.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/errors.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/http.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/lazy_import.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/search_task.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/spinner.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.298 → upgini-1.1.299}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.299"
|
|
@@ -23,7 +23,9 @@ from pandas.api.types import (
|
|
|
23
23
|
from upgini.errors import ValidationError
|
|
24
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
25
|
from upgini.metadata import (
|
|
26
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
26
27
|
EVAL_SET_INDEX,
|
|
28
|
+
SEARCH_KEY_UNNEST,
|
|
27
29
|
SYSTEM_COLUMNS,
|
|
28
30
|
SYSTEM_RECORD_ID,
|
|
29
31
|
TARGET,
|
|
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
79
81
|
path: Optional[str] = None,
|
|
80
82
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
81
83
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
84
|
+
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
82
85
|
model_task_type: Optional[ModelTaskType] = None,
|
|
83
86
|
random_state: Optional[int] = None,
|
|
84
87
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
113
116
|
self.description = description
|
|
114
117
|
self.meaning_types = meaning_types
|
|
115
118
|
self.search_keys = search_keys
|
|
119
|
+
self.unnest_search_keys = unnest_search_keys
|
|
116
120
|
self.ignore_columns = []
|
|
117
121
|
self.hierarchical_group_keys = []
|
|
118
122
|
self.hierarchical_subgroup_keys = []
|
|
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
172
176
|
new_columns = []
|
|
173
177
|
dup_counter = 0
|
|
174
178
|
for column in self.data.columns:
|
|
175
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
|
|
176
180
|
self.columns_renaming[column] = column
|
|
177
181
|
new_columns.append(column)
|
|
178
182
|
continue
|
|
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
353
357
|
|
|
354
358
|
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
355
359
|
try:
|
|
356
|
-
self.data[postal_code] =
|
|
360
|
+
self.data[postal_code] = (
|
|
361
|
+
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
362
|
+
)
|
|
357
363
|
except Exception:
|
|
358
364
|
pass
|
|
359
365
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
803
809
|
meaningType=meaning_type,
|
|
804
810
|
minMaxValues=min_max_values,
|
|
805
811
|
)
|
|
812
|
+
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
813
|
+
column_meta.isUnnest = True
|
|
814
|
+
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
806
815
|
|
|
807
816
|
columns.append(column_meta)
|
|
808
817
|
|
|
@@ -11,6 +11,7 @@ import sys
|
|
|
11
11
|
import tempfile
|
|
12
12
|
import time
|
|
13
13
|
import uuid
|
|
14
|
+
from collections import Counter
|
|
14
15
|
from dataclasses import dataclass
|
|
15
16
|
from threading import Thread
|
|
16
17
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -45,9 +46,11 @@ from upgini.mdc import MDC
|
|
|
45
46
|
from upgini.metadata import (
|
|
46
47
|
COUNTRY,
|
|
47
48
|
DEFAULT_INDEX,
|
|
49
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
48
50
|
EVAL_SET_INDEX,
|
|
49
51
|
ORIGINAL_INDEX,
|
|
50
52
|
RENAMED_INDEX,
|
|
53
|
+
SEARCH_KEY_UNNEST,
|
|
51
54
|
SORT_ID,
|
|
52
55
|
SYSTEM_RECORD_ID,
|
|
53
56
|
TARGET,
|
|
@@ -248,7 +251,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
248
251
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
249
252
|
|
|
250
253
|
validate_version(self.logger)
|
|
251
|
-
self.search_keys = search_keys or
|
|
254
|
+
self.search_keys = search_keys or {}
|
|
252
255
|
self.country_code = country_code
|
|
253
256
|
self.__validate_search_keys(search_keys, search_id)
|
|
254
257
|
self.model_task_type = model_task_type
|
|
@@ -1200,7 +1203,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1200
1203
|
email_column = self._get_email_column(search_keys)
|
|
1201
1204
|
hem_column = self._get_hem_column(search_keys)
|
|
1202
1205
|
if email_column:
|
|
1203
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1206
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1204
1207
|
extended_X = converter.convert(extended_X)
|
|
1205
1208
|
generated_features.extend(converter.generated_features)
|
|
1206
1209
|
if (
|
|
@@ -1353,7 +1356,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1353
1356
|
not in (
|
|
1354
1357
|
excluding_search_keys
|
|
1355
1358
|
+ list(self.fit_dropped_features)
|
|
1356
|
-
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1359
|
+
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
1357
1360
|
)
|
|
1358
1361
|
]
|
|
1359
1362
|
|
|
@@ -1417,7 +1420,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1417
1420
|
fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
|
|
1418
1421
|
)
|
|
1419
1422
|
|
|
1420
|
-
fitting_eval_set_dict =
|
|
1423
|
+
fitting_eval_set_dict = {}
|
|
1421
1424
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1422
1425
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1423
1426
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1534,7 +1537,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1534
1537
|
def __sample_only_input(
|
|
1535
1538
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
1536
1539
|
) -> _SampledDataForMetrics:
|
|
1537
|
-
eval_set_sampled_dict =
|
|
1540
|
+
eval_set_sampled_dict = {}
|
|
1538
1541
|
|
|
1539
1542
|
df = validated_X.copy()
|
|
1540
1543
|
df[TARGET] = validated_y
|
|
@@ -1560,7 +1563,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1560
1563
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1561
1564
|
|
|
1562
1565
|
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1563
|
-
df_extended = self.__add_fit_system_record_id(df_extended,
|
|
1566
|
+
df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1564
1567
|
|
|
1565
1568
|
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1566
1569
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -1584,7 +1587,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1584
1587
|
trace_id: str,
|
|
1585
1588
|
remove_outliers_calc_metrics: Optional[bool],
|
|
1586
1589
|
) -> _SampledDataForMetrics:
|
|
1587
|
-
eval_set_sampled_dict =
|
|
1590
|
+
eval_set_sampled_dict = {}
|
|
1588
1591
|
search_keys = self.fit_search_keys
|
|
1589
1592
|
|
|
1590
1593
|
rows_to_drop = None
|
|
@@ -1658,7 +1661,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1658
1661
|
progress_bar: Optional[ProgressBar],
|
|
1659
1662
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1660
1663
|
) -> _SampledDataForMetrics:
|
|
1661
|
-
eval_set_sampled_dict =
|
|
1664
|
+
eval_set_sampled_dict = {}
|
|
1662
1665
|
if eval_set is not None:
|
|
1663
1666
|
self.logger.info("Transform with eval_set")
|
|
1664
1667
|
# concatenate X and eval_set with eval_set_index
|
|
@@ -1680,7 +1683,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1680
1683
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1681
1684
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1682
1685
|
|
|
1683
|
-
eval_set_sampled_dict =
|
|
1686
|
+
eval_set_sampled_dict = {}
|
|
1684
1687
|
|
|
1685
1688
|
tmp_target_name = "__target"
|
|
1686
1689
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
@@ -1943,11 +1946,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1943
1946
|
self.logger.info("Input dataset hasn't date column")
|
|
1944
1947
|
if self.add_date_if_missing:
|
|
1945
1948
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1949
|
+
|
|
1950
|
+
# Don't pass all features in backend on transform
|
|
1951
|
+
original_features_for_transform = []
|
|
1952
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1953
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1954
|
+
if len(features_not_to_pass) > 0:
|
|
1955
|
+
# Pass only features that need for transform
|
|
1956
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
+
original_features_for_transform = [
|
|
1960
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
+
]
|
|
1962
|
+
|
|
1963
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1964
|
+
|
|
1965
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1966
|
+
|
|
1967
|
+
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1968
|
+
df[columns_for_system_record_id], index=False
|
|
1969
|
+
).astype("Float64")
|
|
1970
|
+
|
|
1971
|
+
# Explode multiple search keys
|
|
1972
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1973
|
+
|
|
1946
1974
|
email_column = self._get_email_column(search_keys)
|
|
1947
1975
|
hem_column = self._get_hem_column(search_keys)
|
|
1948
1976
|
email_converted_to_hem = False
|
|
1949
1977
|
if email_column:
|
|
1950
|
-
converter = EmailSearchKeyConverter(
|
|
1978
|
+
converter = EmailSearchKeyConverter(
|
|
1979
|
+
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
1980
|
+
)
|
|
1951
1981
|
df = converter.convert(df)
|
|
1952
1982
|
generated_features.extend(converter.generated_features)
|
|
1953
1983
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1961,30 +1991,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1961
1991
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1962
1992
|
|
|
1963
1993
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1964
|
-
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1994
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1995
|
+
for col in original_features_for_transform:
|
|
1996
|
+
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1997
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1965
1998
|
|
|
1966
1999
|
if email_converted_to_hem:
|
|
1967
|
-
|
|
2000
|
+
features_not_to_pass.append(email_column)
|
|
1968
2001
|
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1972
|
-
if len(non_keys_columns) > 0:
|
|
1973
|
-
# Pass only features that need for transform
|
|
1974
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1975
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1976
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1977
|
-
original_features_for_transform = [
|
|
1978
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1979
|
-
]
|
|
1980
|
-
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1981
|
-
|
|
1982
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2002
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
2003
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1983
2004
|
|
|
1984
2005
|
if add_fit_system_record_id:
|
|
1985
|
-
df = self.__add_fit_system_record_id(df,
|
|
2006
|
+
df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1986
2007
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1987
|
-
|
|
2008
|
+
features_not_to_pass.append(SORT_ID)
|
|
1988
2009
|
|
|
1989
2010
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1990
2011
|
|
|
@@ -1992,16 +2013,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1992
2013
|
"Float64"
|
|
1993
2014
|
)
|
|
1994
2015
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2016
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2017
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2018
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
1995
2019
|
|
|
1996
2020
|
df = df.reset_index(drop=True)
|
|
1997
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
2021
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
|
|
1998
2022
|
if add_fit_system_record_id:
|
|
1999
2023
|
system_columns_with_original_index.append(SORT_ID)
|
|
2000
2024
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
2001
2025
|
|
|
2002
2026
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2003
2027
|
|
|
2004
|
-
df_without_features = df.drop(columns=
|
|
2028
|
+
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2005
2029
|
|
|
2006
2030
|
df_without_features = clean_full_duplicates(
|
|
2007
2031
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2013,12 +2037,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2013
2037
|
dataset = Dataset(
|
|
2014
2038
|
"sample_" + str(uuid.uuid4()),
|
|
2015
2039
|
df=df_without_features,
|
|
2040
|
+
meaning_types=meaning_types,
|
|
2041
|
+
search_keys=combined_search_keys,
|
|
2042
|
+
unnest_search_keys=unnest_search_keys,
|
|
2016
2043
|
date_format=self.date_format,
|
|
2017
2044
|
rest_client=self.rest_client,
|
|
2018
2045
|
logger=self.logger,
|
|
2019
2046
|
)
|
|
2020
|
-
dataset.meaning_types = meaning_types
|
|
2021
|
-
dataset.search_keys = combined_search_keys
|
|
2022
2047
|
if email_converted_to_hem:
|
|
2023
2048
|
dataset.ignore_columns = [email_column]
|
|
2024
2049
|
|
|
@@ -2157,6 +2182,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2157
2182
|
|
|
2158
2183
|
key_types = search_keys.values()
|
|
2159
2184
|
|
|
2185
|
+
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2186
|
+
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2187
|
+
for multi_key in multi_keys:
|
|
2188
|
+
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2189
|
+
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2190
|
+
self.logger.warning(msg)
|
|
2191
|
+
raise ValidationError(msg)
|
|
2192
|
+
|
|
2160
2193
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2161
2194
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2162
2195
|
self.logger.warning(msg)
|
|
@@ -2172,11 +2205,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2172
2205
|
self.logger.warning(msg)
|
|
2173
2206
|
raise ValidationError(msg)
|
|
2174
2207
|
|
|
2175
|
-
for key_type in SearchKey.__members__.values():
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2208
|
+
# for key_type in SearchKey.__members__.values():
|
|
2209
|
+
# if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2210
|
+
# msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2211
|
+
# self.logger.warning(msg)
|
|
2212
|
+
# raise ValidationError(msg)
|
|
2180
2213
|
|
|
2181
2214
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2182
2215
|
# if (
|
|
@@ -2314,14 +2347,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2314
2347
|
self.logger.info("Input dataset hasn't date column")
|
|
2315
2348
|
if self.add_date_if_missing:
|
|
2316
2349
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2317
|
-
|
|
2318
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2319
|
-
email_converted_to_hem = False
|
|
2320
|
-
if email_column:
|
|
2321
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2322
|
-
df = converter.convert(df)
|
|
2323
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2324
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2350
|
+
|
|
2325
2351
|
if (
|
|
2326
2352
|
self.detect_missing_search_keys
|
|
2327
2353
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2330,7 +2356,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2330
2356
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2331
2357
|
df = converter.convert(df)
|
|
2332
2358
|
|
|
2359
|
+
# Explode multiple search keys
|
|
2333
2360
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2361
|
+
meaning_types = {
|
|
2362
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2363
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2364
|
+
}
|
|
2365
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2366
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2367
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2368
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2369
|
+
|
|
2370
|
+
# TODO check that this is correct for enrichment
|
|
2371
|
+
self.df_with_original_index = df.copy()
|
|
2372
|
+
|
|
2373
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2374
|
+
|
|
2375
|
+
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2376
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2377
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2378
|
+
email_converted_to_hem = False
|
|
2379
|
+
if email_column:
|
|
2380
|
+
converter = EmailSearchKeyConverter(
|
|
2381
|
+
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2382
|
+
)
|
|
2383
|
+
df = converter.convert(df)
|
|
2384
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2385
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2386
|
+
|
|
2387
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2388
|
+
self.fit_search_keys.keys()
|
|
2389
|
+
)
|
|
2334
2390
|
if email_converted_to_hem:
|
|
2335
2391
|
non_feature_columns.append(email_column)
|
|
2336
2392
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2354,12 +2410,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2354
2410
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2355
2411
|
}
|
|
2356
2412
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2413
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2414
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2415
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2357
2416
|
if eval_set is not None and len(eval_set) > 0:
|
|
2358
2417
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2359
2418
|
|
|
2360
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2419
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2361
2420
|
|
|
2362
|
-
self.df_with_original_index = df.copy()
|
|
2363
2421
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2364
2422
|
|
|
2365
2423
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2367,14 +2425,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2367
2425
|
dataset = Dataset(
|
|
2368
2426
|
"tds_" + str(uuid.uuid4()),
|
|
2369
2427
|
df=df,
|
|
2428
|
+
meaning_types=meaning_types,
|
|
2429
|
+
search_keys=combined_search_keys,
|
|
2430
|
+
unnest_search_keys=unnest_search_keys,
|
|
2370
2431
|
model_task_type=model_task_type,
|
|
2371
2432
|
date_format=self.date_format,
|
|
2372
2433
|
random_state=self.random_state,
|
|
2373
2434
|
rest_client=self.rest_client,
|
|
2374
2435
|
logger=self.logger,
|
|
2375
2436
|
)
|
|
2376
|
-
dataset.meaning_types = meaning_types
|
|
2377
|
-
dataset.search_keys = combined_search_keys
|
|
2378
2437
|
if email_converted_to_hem:
|
|
2379
2438
|
dataset.ignore_columns = [email_column]
|
|
2380
2439
|
|
|
@@ -2744,9 +2803,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2744
2803
|
X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
|
|
2745
2804
|
) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2746
2805
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2806
|
+
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
|
2747
2807
|
Xy = X.copy()
|
|
2748
2808
|
Xy[TARGET] = y
|
|
2749
|
-
Xy = Xy.sort_values(by=
|
|
2809
|
+
Xy = Xy.sort_values(by=record_id_column).reset_index(drop=True)
|
|
2750
2810
|
X = Xy.drop(columns=TARGET)
|
|
2751
2811
|
y = Xy[TARGET].copy()
|
|
2752
2812
|
|
|
@@ -2925,15 +2985,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2925
2985
|
|
|
2926
2986
|
@staticmethod
|
|
2927
2987
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2928
|
-
for col, t in search_keys.items()
|
|
2929
|
-
|
|
2930
|
-
|
|
2988
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
|
2989
|
+
if len(cols) > 1:
|
|
2990
|
+
raise Exception("More than one email column found after unnest")
|
|
2991
|
+
if len(cols) == 1:
|
|
2992
|
+
return cols[0]
|
|
2931
2993
|
|
|
2932
2994
|
@staticmethod
|
|
2933
2995
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2934
|
-
for col, t in search_keys.items()
|
|
2935
|
-
|
|
2936
|
-
|
|
2996
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
|
2997
|
+
if len(cols) > 1:
|
|
2998
|
+
raise Exception("More than one hem column found after unnest")
|
|
2999
|
+
if len(cols) == 1:
|
|
3000
|
+
return cols[0]
|
|
2937
3001
|
|
|
2938
3002
|
@staticmethod
|
|
2939
3003
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2941,8 +3005,44 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2941
3005
|
if t == SearchKey.PHONE:
|
|
2942
3006
|
return col
|
|
2943
3007
|
|
|
3008
|
+
def _explode_multiple_search_keys(
|
|
3009
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3010
|
+
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
3011
|
+
# find groups of multiple search keys
|
|
3012
|
+
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3013
|
+
for key_name, key_type in search_keys.items():
|
|
3014
|
+
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3015
|
+
search_key_names_by_type = {
|
|
3016
|
+
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
3017
|
+
}
|
|
3018
|
+
if len(search_key_names_by_type) == 0:
|
|
3019
|
+
return df, {}
|
|
3020
|
+
|
|
3021
|
+
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3022
|
+
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3023
|
+
exploded_dfs = []
|
|
3024
|
+
unnest_search_keys = {}
|
|
3025
|
+
|
|
3026
|
+
for key_type, key_names in search_key_names_by_type.items():
|
|
3027
|
+
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
3028
|
+
exploded_df = pd.melt(
|
|
3029
|
+
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
3030
|
+
)
|
|
3031
|
+
exploded_dfs.append(exploded_df)
|
|
3032
|
+
for old_key in key_names:
|
|
3033
|
+
del search_keys[old_key]
|
|
3034
|
+
search_keys[new_search_key] = key_type
|
|
3035
|
+
unnest_search_keys[new_search_key] = key_names
|
|
3036
|
+
|
|
3037
|
+
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3038
|
+
return df, unnest_search_keys
|
|
3039
|
+
|
|
2944
3040
|
def __add_fit_system_record_id(
|
|
2945
|
-
self,
|
|
3041
|
+
self,
|
|
3042
|
+
df: pd.DataFrame,
|
|
3043
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3044
|
+
search_keys: Dict[str, SearchKey],
|
|
3045
|
+
id_name: str,
|
|
2946
3046
|
) -> pd.DataFrame:
|
|
2947
3047
|
# save original order or rows
|
|
2948
3048
|
original_index_name = df.index.name
|
|
@@ -2953,7 +3053,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2953
3053
|
|
|
2954
3054
|
# order by date and idempotent order by other keys
|
|
2955
3055
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2956
|
-
sort_exclude_columns = [
|
|
3056
|
+
sort_exclude_columns = [
|
|
3057
|
+
original_order_name,
|
|
3058
|
+
ORIGINAL_INDEX,
|
|
3059
|
+
EVAL_SET_INDEX,
|
|
3060
|
+
TARGET,
|
|
3061
|
+
"__target",
|
|
3062
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3063
|
+
]
|
|
2957
3064
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2958
3065
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2959
3066
|
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
@@ -2991,14 +3098,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2991
3098
|
|
|
2992
3099
|
df = df.reset_index(drop=True).reset_index()
|
|
2993
3100
|
# system_record_id saves correct order for fit
|
|
2994
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
3101
|
+
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
2995
3102
|
|
|
2996
3103
|
# return original order
|
|
2997
3104
|
df = df.set_index(ORIGINAL_INDEX)
|
|
2998
3105
|
df.index.name = original_index_name
|
|
2999
3106
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3000
3107
|
|
|
3001
|
-
meaning_types[
|
|
3108
|
+
meaning_types[id_name] = (
|
|
3109
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3110
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3111
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3112
|
+
)
|
|
3002
3113
|
return df
|
|
3003
3114
|
|
|
3004
3115
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3053,7 +3164,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3053
3164
|
)
|
|
3054
3165
|
|
|
3055
3166
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3056
|
-
dup_features = [
|
|
3167
|
+
dup_features = [
|
|
3168
|
+
c
|
|
3169
|
+
for c in comparing_columns
|
|
3170
|
+
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3171
|
+
]
|
|
3057
3172
|
if len(dup_features) > 0:
|
|
3058
3173
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3059
3174
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3064,8 +3179,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3064
3179
|
result_features = pd.merge(
|
|
3065
3180
|
df_with_original_index,
|
|
3066
3181
|
result_features,
|
|
3067
|
-
|
|
3068
|
-
right_on=SYSTEM_RECORD_ID,
|
|
3182
|
+
on=ENTITY_SYSTEM_RECORD_ID,
|
|
3069
3183
|
how="left" if is_transform else "inner",
|
|
3070
3184
|
)
|
|
3071
3185
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3076,7 +3190,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3076
3190
|
result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
|
|
3077
3191
|
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3078
3192
|
|
|
3079
|
-
result_eval_sets =
|
|
3193
|
+
result_eval_sets = {}
|
|
3080
3194
|
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3081
3195
|
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3082
3196
|
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
@@ -3288,7 +3402,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3288
3402
|
if autofe_feature.op.is_vector:
|
|
3289
3403
|
continue
|
|
3290
3404
|
|
|
3291
|
-
description =
|
|
3405
|
+
description = {}
|
|
3292
3406
|
|
|
3293
3407
|
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3294
3408
|
if feature_meta is None:
|
|
@@ -3454,13 +3568,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3454
3568
|
self.warning_counter.increment()
|
|
3455
3569
|
|
|
3456
3570
|
if len(valid_search_keys) == 1:
|
|
3457
|
-
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
|
|
3463
|
-
|
|
3571
|
+
key, value = list(valid_search_keys.items())[0]
|
|
3572
|
+
# Show warning for country only if country is the only key
|
|
3573
|
+
if x[key].nunique() == 1:
|
|
3574
|
+
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3575
|
+
print(msg)
|
|
3576
|
+
self.logger.warning(msg)
|
|
3577
|
+
self.warning_counter.increment()
|
|
3464
3578
|
|
|
3465
3579
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3466
3580
|
|
|
@@ -3570,61 +3684,68 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3570
3684
|
def check_need_detect(search_key: SearchKey):
|
|
3571
3685
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3572
3686
|
|
|
3573
|
-
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
3687
|
+
# if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3688
|
+
if check_need_detect(SearchKey.POSTAL_CODE):
|
|
3689
|
+
maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3690
|
+
if maybe_keys:
|
|
3691
|
+
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
3692
|
+
search_keys.update(new_keys)
|
|
3693
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3694
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3579
3695
|
if not silent_mode:
|
|
3580
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3696
|
+
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
3581
3697
|
|
|
3582
3698
|
if (
|
|
3583
3699
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3584
3700
|
and self.country_code is None
|
|
3585
3701
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3586
3702
|
):
|
|
3587
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3588
|
-
if maybe_key
|
|
3589
|
-
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3590
|
-
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3703
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3704
|
+
if maybe_key:
|
|
3705
|
+
search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3706
|
+
self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3591
3707
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3592
3708
|
if not silent_mode:
|
|
3593
3709
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3594
3710
|
|
|
3595
3711
|
if (
|
|
3596
|
-
SearchKey.EMAIL not in search_keys.values()
|
|
3597
|
-
|
|
3712
|
+
# SearchKey.EMAIL not in search_keys.values()
|
|
3713
|
+
SearchKey.HEM not in search_keys.values()
|
|
3598
3714
|
and check_need_detect(SearchKey.HEM)
|
|
3599
3715
|
):
|
|
3600
|
-
|
|
3601
|
-
if
|
|
3716
|
+
maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3717
|
+
if maybe_keys:
|
|
3602
3718
|
if self.__is_registered or is_demo_dataset:
|
|
3603
|
-
|
|
3604
|
-
|
|
3605
|
-
self.
|
|
3719
|
+
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
3720
|
+
search_keys.update(new_keys)
|
|
3721
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3722
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3606
3723
|
if not silent_mode:
|
|
3607
|
-
print(self.bundle.get("email_detected").format(
|
|
3724
|
+
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
3608
3725
|
else:
|
|
3609
3726
|
self.logger.warning(
|
|
3610
|
-
f"Autodetected search key EMAIL in column {
|
|
3727
|
+
f"Autodetected search key EMAIL in column {maybe_keys}."
|
|
3728
|
+
" But not used because not registered user"
|
|
3611
3729
|
)
|
|
3612
3730
|
if not silent_mode:
|
|
3613
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3731
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3614
3732
|
self.warning_counter.increment()
|
|
3615
3733
|
|
|
3616
|
-
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3617
|
-
|
|
3618
|
-
|
|
3734
|
+
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3735
|
+
if check_need_detect(SearchKey.PHONE):
|
|
3736
|
+
maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3737
|
+
if maybe_keys:
|
|
3619
3738
|
if self.__is_registered or is_demo_dataset:
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
self.
|
|
3739
|
+
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
3740
|
+
search_keys.update(new_keys)
|
|
3741
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3742
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3623
3743
|
if not silent_mode:
|
|
3624
|
-
print(self.bundle.get("phone_detected").format(
|
|
3744
|
+
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
3625
3745
|
else:
|
|
3626
3746
|
self.logger.warning(
|
|
3627
|
-
f"Autodetected search key PHONE in column {
|
|
3747
|
+
f"Autodetected search key PHONE in column {maybe_keys}. "
|
|
3748
|
+
"But not used because not registered user"
|
|
3628
3749
|
)
|
|
3629
3750
|
if not silent_mode:
|
|
3630
3751
|
print(self.bundle.get("phone_detected_not_registered"))
|
|
@@ -6,6 +6,8 @@ from typing import Dict, List, Optional, Set
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
9
|
+
ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
|
|
10
|
+
SEARCH_KEY_UNNEST = "search_key_unnest"
|
|
9
11
|
SORT_ID = "sort_id"
|
|
10
12
|
EVAL_SET_INDEX = "eval_set_index"
|
|
11
13
|
TARGET = "target"
|
|
@@ -13,7 +15,7 @@ COUNTRY = "country_iso_code"
|
|
|
13
15
|
RENAMED_INDEX = "index_col"
|
|
14
16
|
DEFAULT_INDEX = "index"
|
|
15
17
|
ORIGINAL_INDEX = "original_index"
|
|
16
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY
|
|
18
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
class FileColumnMeaningType(Enum):
|
|
@@ -39,6 +41,8 @@ class FileColumnMeaningType(Enum):
|
|
|
39
41
|
POSTAL_CODE = "POSTAL_CODE"
|
|
40
42
|
SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
|
|
41
43
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
44
|
+
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
|
+
UNNEST_KEY = "UNNEST_KEY"
|
|
42
46
|
|
|
43
47
|
|
|
44
48
|
class SearchKey(Enum):
|
|
@@ -184,6 +188,10 @@ class FileColumnMetadata(BaseModel):
|
|
|
184
188
|
meaningType: FileColumnMeaningType
|
|
185
189
|
minMaxValues: Optional[NumericInterval] = None
|
|
186
190
|
originalName: Optional[str]
|
|
191
|
+
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
192
|
+
isUnnest: bool = False
|
|
193
|
+
# list of original etalon key column names like msisdn1, msisdn2
|
|
194
|
+
unnestKeyNames: Optional[list[str]]
|
|
187
195
|
|
|
188
196
|
|
|
189
197
|
class FileMetadata(BaseModel):
|
|
@@ -281,7 +289,7 @@ class FeaturesFilter(BaseModel):
|
|
|
281
289
|
|
|
282
290
|
|
|
283
291
|
class RuntimeParameters(BaseModel):
|
|
284
|
-
properties: Dict[str, str] =
|
|
292
|
+
properties: Dict[str, str] = {}
|
|
285
293
|
|
|
286
294
|
|
|
287
295
|
class SearchCustomization(BaseModel):
|
|
@@ -369,7 +369,7 @@ class EstimatorWrapper:
|
|
|
369
369
|
"logger": logger,
|
|
370
370
|
}
|
|
371
371
|
if estimator is None:
|
|
372
|
-
params =
|
|
372
|
+
params = {}
|
|
373
373
|
params["has_time"] = has_date
|
|
374
374
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
375
375
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
@@ -88,6 +88,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
|
88
88
|
search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
89
89
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
90
90
|
single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
91
|
+
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
91
92
|
unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
92
93
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
93
94
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseSearchKeyDetector:
|
|
7
|
+
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
8
|
+
raise NotImplementedError
|
|
9
|
+
|
|
10
|
+
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
|
+
raise NotImplementedError
|
|
12
|
+
|
|
13
|
+
def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
|
|
14
|
+
return [
|
|
15
|
+
column_name
|
|
16
|
+
for column_name in column_names
|
|
17
|
+
if self._is_search_key_by_name(column_name)
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
|
|
21
|
+
other_columns = [col for col in df.columns if col not in existing_search_keys]
|
|
22
|
+
columns_by_names = self._get_search_keys_by_name(other_columns)
|
|
23
|
+
columns_by_values = []
|
|
24
|
+
for column_name in other_columns:
|
|
25
|
+
if self._is_search_key_by_values(df[column_name]):
|
|
26
|
+
columns_by_values.append(column_name)
|
|
27
|
+
return list(set(columns_by_names + columns_by_values))
|
|
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import
|
|
6
|
+
from upgini.metadata import (
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
+
EVAL_SET_INDEX,
|
|
9
|
+
SORT_ID,
|
|
10
|
+
SYSTEM_RECORD_ID,
|
|
11
|
+
TARGET,
|
|
12
|
+
ModelTaskType,
|
|
13
|
+
SearchKey,
|
|
14
|
+
)
|
|
7
15
|
from upgini.resource_bundle import ResourceBundle
|
|
8
16
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
17
|
from upgini.utils.target_utils import define_task
|
|
@@ -143,6 +151,8 @@ def clean_full_duplicates(
|
|
|
143
151
|
unique_columns = df.columns.tolist()
|
|
144
152
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
145
153
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
+
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
+
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
146
156
|
if SORT_ID in unique_columns:
|
|
147
157
|
unique_columns.remove(SORT_ID)
|
|
148
158
|
if EVAL_SET_INDEX in unique_columns:
|
|
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
|
|
|
38
38
|
email_column: str,
|
|
39
39
|
hem_column: Optional[str],
|
|
40
40
|
search_keys: Dict[str, SearchKey],
|
|
41
|
+
unnest_search_keys: Optional[List[str]] = None,
|
|
41
42
|
logger: Optional[logging.Logger] = None,
|
|
42
43
|
):
|
|
43
44
|
self.email_column = email_column
|
|
44
45
|
self.hem_column = hem_column
|
|
45
46
|
self.search_keys = search_keys
|
|
47
|
+
self.unnest_search_keys = unnest_search_keys
|
|
46
48
|
if logger is not None:
|
|
47
49
|
self.logger = logger
|
|
48
50
|
else:
|
|
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
|
|
|
80
82
|
del self.search_keys[self.email_column]
|
|
81
83
|
return df
|
|
82
84
|
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
+
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
83
86
|
self.email_converted_to_hem = True
|
|
84
87
|
|
|
85
88
|
del self.search_keys[self.email_column]
|
|
89
|
+
if self.email_column in self.unnest_search_keys:
|
|
90
|
+
self.unnest_search_keys.remove(self.email_column)
|
|
86
91
|
|
|
87
92
|
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
88
93
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.1.298"
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
from typing import List, Optional
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class BaseSearchKeyDetector:
|
|
7
|
-
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
8
|
-
raise NotImplementedError
|
|
9
|
-
|
|
10
|
-
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
|
-
raise NotImplementedError
|
|
12
|
-
|
|
13
|
-
def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
|
|
14
|
-
for column_name in column_names:
|
|
15
|
-
if self._is_search_key_by_name(column_name):
|
|
16
|
-
return column_name
|
|
17
|
-
|
|
18
|
-
def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
|
|
19
|
-
maybe_column = self._get_search_key_by_name(df.columns.to_list())
|
|
20
|
-
if maybe_column is not None:
|
|
21
|
-
return maybe_column
|
|
22
|
-
|
|
23
|
-
for column_name in df.columns:
|
|
24
|
-
if self._is_search_key_by_values(df[column_name]):
|
|
25
|
-
return column_name
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|