upgini 1.1.297__py3-none-any.whl → 1.1.299__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.297"
1
+ __version__ = "1.1.299"
@@ -1,7 +1,14 @@
1
1
  from typing import Dict
2
2
 
3
3
  from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
- from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
4
+ from upgini.autofe.date import (
5
+ DateDiff,
6
+ DateDiffType2,
7
+ DateListDiff,
8
+ DateListDiffBounded,
9
+ DatePercentile,
10
+ DatePercentileMethod2,
11
+ )
5
12
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
6
13
  from upgini.autofe.operand import Operand
7
14
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
@@ -50,6 +57,7 @@ ALL_OPERANDS: Dict[str, Operand] = {
50
57
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
51
58
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
59
  DatePercentile(),
60
+ DatePercentileMethod2(),
53
61
  Norm(),
54
62
  ]
55
63
  }
upgini/autofe/date.py CHANGED
@@ -1,3 +1,4 @@
1
+ import abc
1
2
  from typing import Any, Dict, List, Optional, Union
2
3
 
3
4
  import numpy as np
@@ -38,6 +39,7 @@ class DateDiffMixin(BaseModel):
38
39
 
39
40
  class DateDiff(PandasOperand, DateDiffMixin):
40
41
  name = "date_diff"
42
+ alias = "date_diff_type1"
41
43
  is_binary = True
42
44
  has_symmetry_importance = True
43
45
 
@@ -159,12 +161,45 @@ class DateListDiffBounded(DateListDiff):
159
161
  return super()._agg(x)
160
162
 
161
163
 
162
- class DatePercentile(PandasOperand):
163
- name = "date_per"
164
class DatePercentileBase(PandasOperand, abc.ABC):
    """Binary operand that ranks a feature value against per-row bounds.

    The left input is treated as a date column and the right input as a
    feature column. Subclasses provide ``_get_bounds``, which maps the parsed
    dates to one array of bounds per row; the result for a row is the 1-based
    position of the last bound that the feature value reaches.
    """

    is_binary = True
    output_type = "float"

    # Passed to ``pd.to_datetime(..., unit=...)`` when parsing the date column.
    date_unit: Optional[str] = None

    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
        """Compute the per-row position of ``right`` within its bounds.

        Args:
            left: Date column (assumed by the original code comment).
            right: Feature column.

        Returns:
            A series indexed like ``right`` holding the value returned by
            ``_perc`` for each index label.
        """
        dates = pd.to_datetime(left, unit=self.date_unit)
        row_bounds = self._get_bounds(dates)

        def position_for(label):
            # Both series are addressed by index label, not by position.
            return self._perc(right[label], row_bounds[label])

        return right.index.to_series().apply(position_for)

    @abc.abstractmethod
    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        """Return, for every date in ``date_col``, the bounds to compare against."""

    def _perc(self, f, bounds):
        """Return ``1 + i`` for the largest ``i`` with ``f >= bounds[i]``, else NaN."""
        reached = np.where(f >= bounds)[0]
        if reached.size == 0:
            return np.nan
        return np.max(reached) + 1

    def get_params(self) -> Dict[str, Optional[str]]:
        """Extend the parent's parameters with ``date_unit``."""
        params = super().get_params()
        params["date_unit"] = self.date_unit
        return params
197
+
198
+
199
+ class DatePercentile(DatePercentileBase):
200
+ name = "date_per"
201
+ alias = "date_per_method1"
202
+
168
203
  zero_month: Optional[int]
169
204
  zero_year: Optional[int]
170
205
  zero_bounds: Optional[List[float]]
@@ -174,7 +209,6 @@ class DatePercentile(PandasOperand):
174
209
  res = super().get_params()
175
210
  res.update(
176
211
  {
177
- "date_unit": self.date_unit,
178
212
  "zero_month": self.zero_month,
179
213
  "zero_year": self.zero_year,
180
214
  "zero_bounds": self.zero_bounds,
@@ -190,22 +224,18 @@ class DatePercentile(PandasOperand):
190
224
  elif isinstance(value, str):
191
225
  return value[1:-1].split(", ")
192
226
 
193
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
194
- # Assuming that left is a date column, right is a feature column
195
- left = pd.to_datetime(left, unit=self.date_unit)
196
- months = left.dt.month
197
- years = left.dt.year
227
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
228
+ months = date_col.dt.month
229
+ years = date_col.dt.year
198
230
 
199
231
  month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
200
- bounds = month_diffs.apply(
232
+ return month_diffs.apply(
201
233
  lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
202
234
  )
203
235
 
204
- return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
205
236
 
206
- def __perc(self, f, bounds):
207
- hit = np.where(f >= bounds)[0]
208
- if hit.size > 0:
209
- return np.max(hit) + 1
210
- else:
211
- return np.nan
237
class DatePercentileMethod2(DatePercentileBase):
    """Date-percentile operand registered under the name ``date_per_method2``."""

    name = "date_per_method2"

    def _get_bounds(self, date_col: pd.Series) -> pd.Series:
        # NOTE(review): placeholder - no bounds are computed here and the
        # method returns None despite the declared ``pd.Series`` return type.
        # Presumably this operand is evaluated elsewhere; confirm before
        # calling ``calculate_binary`` on it locally.
        return None
upgini/autofe/feature.py CHANGED
@@ -41,7 +41,7 @@ class Column:
41
41
  def get_column_nodes(self) -> List["Column"]:
42
42
  return [self]
43
43
 
44
- def get_columns(self) -> List[str]:
44
+ def get_columns(self, **kwargs) -> List[str]:
45
45
  return [self.name]
46
46
 
47
47
  def infer_type(self, data: pd.DataFrame) -> DtypeObj:
@@ -57,6 +57,12 @@ class Column:
57
57
  def to_pretty_formula(self) -> str:
58
58
  return self.to_formula()
59
59
 
60
def __eq__(self, value: object) -> bool:
    """Two columns are equal when both ``name`` and ``calculate_all`` match.

    Any object that is not a ``Column`` compares unequal.
    """
    if not isinstance(value, Column):
        return False
    return self.name == value.name and self.calculate_all == value.calculate_all

def __hash__(self) -> int:
    """Hash on the same fields that ``__eq__`` compares.

    A class that defines ``__eq__`` without ``__hash__`` gets
    ``__hash__ = None`` and its instances can no longer be used in sets or
    as dict keys; defining it here keeps ``Column`` hashable and consistent
    with equality.
    """
    return hash((self.name, self.calculate_all))
65
+
60
66
 
61
67
  class Feature:
62
68
  def __init__(
@@ -125,6 +131,9 @@ class Feature:
125
131
  for child in self.children:
126
132
  child.delete_data()
127
133
 
134
def get_op_display_name(self) -> str:
    """Return the operand's alias if it has one, else its lower-cased name."""
    operand = self.op
    if operand.alias:
        return operand.alias
    return operand.name.lower()
136
+
128
137
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
129
138
  if self.cached_display_name is not None and cache:
130
139
  return self.cached_display_name
@@ -132,11 +141,11 @@ class Feature:
132
141
  if self.alias:
133
142
  components = ["f_autofe", self.alias]
134
143
  elif shorten and not self.op.is_unary:
135
- components = ["f_autofe", self.op.alias or self.op.name.lower()]
144
+ components = ["f_autofe", self.get_op_display_name()]
136
145
  else:
137
146
  components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
138
147
  "autofe",
139
- self.op.alias or self.op.name.lower(),
148
+ self.get_op_display_name(),
140
149
  ]
141
150
  components.extend([str(self.display_index)] if self.display_index is not None else [])
142
151
  display_name = "_".join(components)
@@ -306,8 +315,21 @@ class FeatureGroup:
306
315
  main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
307
316
  if isinstance(self.op, PandasOperand):
308
317
  columns = self.get_columns()
309
- new_data = self.op.calculate_group(data[columns], main_column=main_column)
310
- new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
318
+ lower_order_children = [
319
+ ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
320
+ ]
321
+ lower_order_names = [ch.get_display_name() for ch in lower_order_children]
322
+ if any(isinstance(f, Feature) for f in lower_order_children):
323
+ child_data = pd.concat(
324
+ [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
325
+ axis=1,
326
+ )
327
+ child_data.columns = [main_column] + lower_order_names
328
+ else:
329
+ child_data = data[columns]
330
+
331
+ new_data = self.op.calculate_group(child_data, main_column=main_column)
332
+ new_data.rename(columns=dict(zip(lower_order_names, self.get_display_names())), inplace=True)
311
333
  else:
312
334
  raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
313
335
 
upgini/dataset.py CHANGED
@@ -23,7 +23,9 @@ from pandas.api.types import (
23
23
  from upgini.errors import ValidationError
24
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
25
  from upgini.metadata import (
26
+ ENTITY_SYSTEM_RECORD_ID,
26
27
  EVAL_SET_INDEX,
28
+ SEARCH_KEY_UNNEST,
27
29
  SYSTEM_COLUMNS,
28
30
  SYSTEM_RECORD_ID,
29
31
  TARGET,
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
79
81
  path: Optional[str] = None,
80
82
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
81
83
  search_keys: Optional[List[Tuple[str, ...]]] = None,
84
+ unnest_search_keys: Optional[Dict[str, str]] = None,
82
85
  model_task_type: Optional[ModelTaskType] = None,
83
86
  random_state: Optional[int] = None,
84
87
  rest_client: Optional[_RestClient] = None,
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
113
116
  self.description = description
114
117
  self.meaning_types = meaning_types
115
118
  self.search_keys = search_keys
119
+ self.unnest_search_keys = unnest_search_keys
116
120
  self.ignore_columns = []
117
121
  self.hierarchical_group_keys = []
118
122
  self.hierarchical_subgroup_keys = []
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
172
176
  new_columns = []
173
177
  dup_counter = 0
174
178
  for column in self.data.columns:
175
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
179
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
176
180
  self.columns_renaming[column] = column
177
181
  new_columns.append(column)
178
182
  continue
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
353
357
 
354
358
  if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
355
359
  try:
356
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
360
+ self.data[postal_code] = (
361
+ self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
362
+ )
357
363
  except Exception:
358
364
  pass
359
365
  elif is_float_dtype(self.data[postal_code]):
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
803
809
  meaningType=meaning_type,
804
810
  minMaxValues=min_max_values,
805
811
  )
812
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
813
+ column_meta.isUnnest = True
814
+ column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
806
815
 
807
816
  columns.append(column_meta)
808
817
 
@@ -11,6 +11,7 @@ import sys
11
11
  import tempfile
12
12
  import time
13
13
  import uuid
14
+ from collections import Counter
14
15
  from dataclasses import dataclass
15
16
  from threading import Thread
16
17
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -45,9 +46,11 @@ from upgini.mdc import MDC
45
46
  from upgini.metadata import (
46
47
  COUNTRY,
47
48
  DEFAULT_INDEX,
49
+ ENTITY_SYSTEM_RECORD_ID,
48
50
  EVAL_SET_INDEX,
49
51
  ORIGINAL_INDEX,
50
52
  RENAMED_INDEX,
53
+ SEARCH_KEY_UNNEST,
51
54
  SORT_ID,
52
55
  SYSTEM_RECORD_ID,
53
56
  TARGET,
@@ -248,7 +251,7 @@ class FeaturesEnricher(TransformerMixin):
248
251
  self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
249
252
 
250
253
  validate_version(self.logger)
251
- self.search_keys = search_keys or dict()
254
+ self.search_keys = search_keys or {}
252
255
  self.country_code = country_code
253
256
  self.__validate_search_keys(search_keys, search_id)
254
257
  self.model_task_type = model_task_type
@@ -1200,7 +1203,7 @@ class FeaturesEnricher(TransformerMixin):
1200
1203
  email_column = self._get_email_column(search_keys)
1201
1204
  hem_column = self._get_hem_column(search_keys)
1202
1205
  if email_column:
1203
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1206
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1204
1207
  extended_X = converter.convert(extended_X)
1205
1208
  generated_features.extend(converter.generated_features)
1206
1209
  if (
@@ -1353,7 +1356,7 @@ class FeaturesEnricher(TransformerMixin):
1353
1356
  not in (
1354
1357
  excluding_search_keys
1355
1358
  + list(self.fit_dropped_features)
1356
- + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1359
+ + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1357
1360
  )
1358
1361
  ]
1359
1362
 
@@ -1417,7 +1420,7 @@ class FeaturesEnricher(TransformerMixin):
1417
1420
  fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
1418
1421
  )
1419
1422
 
1420
- fitting_eval_set_dict = dict()
1423
+ fitting_eval_set_dict = {}
1421
1424
  for idx, eval_tuple in eval_set_sampled_dict.items():
1422
1425
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1423
1426
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1534,7 +1537,7 @@ class FeaturesEnricher(TransformerMixin):
1534
1537
  def __sample_only_input(
1535
1538
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1536
1539
  ) -> _SampledDataForMetrics:
1537
- eval_set_sampled_dict = dict()
1540
+ eval_set_sampled_dict = {}
1538
1541
 
1539
1542
  df = validated_X.copy()
1540
1543
  df[TARGET] = validated_y
@@ -1560,7 +1563,7 @@ class FeaturesEnricher(TransformerMixin):
1560
1563
  df = df.sample(n=sample_rows, random_state=self.random_state)
1561
1564
 
1562
1565
  df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1563
- df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
1566
+ df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1564
1567
 
1565
1568
  train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1566
1569
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -1584,7 +1587,7 @@ class FeaturesEnricher(TransformerMixin):
1584
1587
  trace_id: str,
1585
1588
  remove_outliers_calc_metrics: Optional[bool],
1586
1589
  ) -> _SampledDataForMetrics:
1587
- eval_set_sampled_dict = dict()
1590
+ eval_set_sampled_dict = {}
1588
1591
  search_keys = self.fit_search_keys
1589
1592
 
1590
1593
  rows_to_drop = None
@@ -1658,7 +1661,7 @@ class FeaturesEnricher(TransformerMixin):
1658
1661
  progress_bar: Optional[ProgressBar],
1659
1662
  progress_callback: Optional[Callable[[SearchProgress], Any]],
1660
1663
  ) -> _SampledDataForMetrics:
1661
- eval_set_sampled_dict = dict()
1664
+ eval_set_sampled_dict = {}
1662
1665
  if eval_set is not None:
1663
1666
  self.logger.info("Transform with eval_set")
1664
1667
  # concatenate X and eval_set with eval_set_index
@@ -1680,7 +1683,7 @@ class FeaturesEnricher(TransformerMixin):
1680
1683
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1681
1684
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1682
1685
 
1683
- eval_set_sampled_dict = dict()
1686
+ eval_set_sampled_dict = {}
1684
1687
 
1685
1688
  tmp_target_name = "__target"
1686
1689
  df = df.rename(columns={TARGET: tmp_target_name})
@@ -1943,11 +1946,38 @@ class FeaturesEnricher(TransformerMixin):
1943
1946
  self.logger.info("Input dataset hasn't date column")
1944
1947
  if self.add_date_if_missing:
1945
1948
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1949
+
1950
+ # Don't pass all features in backend on transform
1951
+ original_features_for_transform = []
1952
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1953
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1954
+ if len(features_not_to_pass) > 0:
1955
+ # Pass only features that need for transform
1956
+ features_for_transform = self._search_task.get_features_for_transform()
1957
+ if features_for_transform is not None and len(features_for_transform) > 0:
1958
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1959
+ original_features_for_transform = [
1960
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1961
+ ]
1962
+
1963
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1964
+
1965
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1966
+
1967
+ df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1968
+ df[columns_for_system_record_id], index=False
1969
+ ).astype("Float64")
1970
+
1971
+ # Explode multiple search keys
1972
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1973
+
1946
1974
  email_column = self._get_email_column(search_keys)
1947
1975
  hem_column = self._get_hem_column(search_keys)
1948
1976
  email_converted_to_hem = False
1949
1977
  if email_column:
1950
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1978
+ converter = EmailSearchKeyConverter(
1979
+ email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
1980
+ )
1951
1981
  df = converter.convert(df)
1952
1982
  generated_features.extend(converter.generated_features)
1953
1983
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1961,30 +1991,21 @@ class FeaturesEnricher(TransformerMixin):
1961
1991
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1962
1992
 
1963
1993
  meaning_types = {col: key.value for col, key in search_keys.items()}
1964
- non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1994
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1995
+ for col in original_features_for_transform:
1996
+ meaning_types[col] = FileColumnMeaningType.FEATURE
1997
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1965
1998
 
1966
1999
  if email_converted_to_hem:
1967
- non_keys_columns.append(email_column)
2000
+ features_not_to_pass.append(email_column)
1968
2001
 
1969
- # Don't pass features in backend on transform
1970
- original_features_for_transform = None
1971
- runtime_parameters = self._get_copy_of_runtime_parameters()
1972
- if len(non_keys_columns) > 0:
1973
- # Pass only features that need for transform
1974
- features_for_transform = self._search_task.get_features_for_transform()
1975
- if features_for_transform is not None and len(features_for_transform) > 0:
1976
- file_metadata = self._search_task.get_file_metadata(trace_id)
1977
- original_features_for_transform = [
1978
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1979
- ]
1980
- non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1981
-
1982
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2002
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
2003
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1983
2004
 
1984
2005
  if add_fit_system_record_id:
1985
- df = self.__add_fit_system_record_id(df, dict(), search_keys)
2006
+ df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
1986
2007
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1987
- non_keys_columns.append(SORT_ID)
2008
+ features_not_to_pass.append(SORT_ID)
1988
2009
 
1989
2010
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1990
2011
 
@@ -1992,16 +2013,19 @@ class FeaturesEnricher(TransformerMixin):
1992
2013
  "Float64"
1993
2014
  )
1994
2015
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2016
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2017
+ if SEARCH_KEY_UNNEST in df.columns:
2018
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
1995
2019
 
1996
2020
  df = df.reset_index(drop=True)
1997
- system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
2021
+ system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1998
2022
  if add_fit_system_record_id:
1999
2023
  system_columns_with_original_index.append(SORT_ID)
2000
2024
  df_with_original_index = df[system_columns_with_original_index].copy()
2001
2025
 
2002
2026
  combined_search_keys = combine_search_keys(search_keys.keys())
2003
2027
 
2004
- df_without_features = df.drop(columns=non_keys_columns)
2028
+ df_without_features = df.drop(columns=features_not_to_pass)
2005
2029
 
2006
2030
  df_without_features = clean_full_duplicates(
2007
2031
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -2013,12 +2037,13 @@ class FeaturesEnricher(TransformerMixin):
2013
2037
  dataset = Dataset(
2014
2038
  "sample_" + str(uuid.uuid4()),
2015
2039
  df=df_without_features,
2040
+ meaning_types=meaning_types,
2041
+ search_keys=combined_search_keys,
2042
+ unnest_search_keys=unnest_search_keys,
2016
2043
  date_format=self.date_format,
2017
2044
  rest_client=self.rest_client,
2018
2045
  logger=self.logger,
2019
2046
  )
2020
- dataset.meaning_types = meaning_types
2021
- dataset.search_keys = combined_search_keys
2022
2047
  if email_converted_to_hem:
2023
2048
  dataset.ignore_columns = [email_column]
2024
2049
 
@@ -2157,6 +2182,14 @@ class FeaturesEnricher(TransformerMixin):
2157
2182
 
2158
2183
  key_types = search_keys.values()
2159
2184
 
2185
+ # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2186
+ multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2187
+ for multi_key in multi_keys:
2188
+ if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2189
+ msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2190
+ self.logger.warning(msg)
2191
+ raise ValidationError(msg)
2192
+
2160
2193
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2161
2194
  msg = self.bundle.get("date_and_datetime_simultanious")
2162
2195
  self.logger.warning(msg)
@@ -2172,11 +2205,11 @@ class FeaturesEnricher(TransformerMixin):
2172
2205
  self.logger.warning(msg)
2173
2206
  raise ValidationError(msg)
2174
2207
 
2175
- for key_type in SearchKey.__members__.values():
2176
- if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2177
- msg = self.bundle.get("multiple_search_key").format(key_type)
2178
- self.logger.warning(msg)
2179
- raise ValidationError(msg)
2208
+ # for key_type in SearchKey.__members__.values():
2209
+ # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2210
+ # msg = self.bundle.get("multiple_search_key").format(key_type)
2211
+ # self.logger.warning(msg)
2212
+ # raise ValidationError(msg)
2180
2213
 
2181
2214
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2182
2215
  # if (
@@ -2314,14 +2347,7 @@ class FeaturesEnricher(TransformerMixin):
2314
2347
  self.logger.info("Input dataset hasn't date column")
2315
2348
  if self.add_date_if_missing:
2316
2349
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2317
- email_column = self._get_email_column(self.fit_search_keys)
2318
- hem_column = self._get_hem_column(self.fit_search_keys)
2319
- email_converted_to_hem = False
2320
- if email_column:
2321
- converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2322
- df = converter.convert(df)
2323
- self.fit_generated_features.extend(converter.generated_features)
2324
- email_converted_to_hem = converter.email_converted_to_hem
2350
+
2325
2351
  if (
2326
2352
  self.detect_missing_search_keys
2327
2353
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2330,7 +2356,37 @@ class FeaturesEnricher(TransformerMixin):
2330
2356
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2331
2357
  df = converter.convert(df)
2332
2358
 
2359
+ # Explode multiple search keys
2333
2360
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2361
+ meaning_types = {
2362
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2363
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2364
+ }
2365
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2366
+ if eval_set is not None and len(eval_set) > 0:
2367
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2368
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2369
+
2370
+ # TODO check that this is correct for enrichment
2371
+ self.df_with_original_index = df.copy()
2372
+
2373
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2374
+
2375
+ # Convert EMAIL to HEM after unnesting to do it only with one column
2376
+ email_column = self._get_email_column(self.fit_search_keys)
2377
+ hem_column = self._get_hem_column(self.fit_search_keys)
2378
+ email_converted_to_hem = False
2379
+ if email_column:
2380
+ converter = EmailSearchKeyConverter(
2381
+ email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2382
+ )
2383
+ df = converter.convert(df)
2384
+ self.fit_generated_features.extend(converter.generated_features)
2385
+ email_converted_to_hem = converter.email_converted_to_hem
2386
+
2387
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2388
+ self.fit_search_keys.keys()
2389
+ )
2334
2390
  if email_converted_to_hem:
2335
2391
  non_feature_columns.append(email_column)
2336
2392
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2354,12 +2410,14 @@ class FeaturesEnricher(TransformerMixin):
2354
2410
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2355
2411
  }
2356
2412
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2413
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2414
+ if SEARCH_KEY_UNNEST in df.columns:
2415
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2357
2416
  if eval_set is not None and len(eval_set) > 0:
2358
2417
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2359
2418
 
2360
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2419
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2361
2420
 
2362
- self.df_with_original_index = df.copy()
2363
2421
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2364
2422
 
2365
2423
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2367,14 +2425,15 @@ class FeaturesEnricher(TransformerMixin):
2367
2425
  dataset = Dataset(
2368
2426
  "tds_" + str(uuid.uuid4()),
2369
2427
  df=df,
2428
+ meaning_types=meaning_types,
2429
+ search_keys=combined_search_keys,
2430
+ unnest_search_keys=unnest_search_keys,
2370
2431
  model_task_type=model_task_type,
2371
2432
  date_format=self.date_format,
2372
2433
  random_state=self.random_state,
2373
2434
  rest_client=self.rest_client,
2374
2435
  logger=self.logger,
2375
2436
  )
2376
- dataset.meaning_types = meaning_types
2377
- dataset.search_keys = combined_search_keys
2378
2437
  if email_converted_to_hem:
2379
2438
  dataset.ignore_columns = [email_column]
2380
2439
 
@@ -2744,9 +2803,10 @@ class FeaturesEnricher(TransformerMixin):
2744
2803
  X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
2745
2804
  ) -> Tuple[pd.DataFrame, pd.Series]:
2746
2805
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
2806
+ record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
2747
2807
  Xy = X.copy()
2748
2808
  Xy[TARGET] = y
2749
- Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2809
+ Xy = Xy.sort_values(by=record_id_column).reset_index(drop=True)
2750
2810
  X = Xy.drop(columns=TARGET)
2751
2811
  y = Xy[TARGET].copy()
2752
2812
 
@@ -2925,15 +2985,19 @@ class FeaturesEnricher(TransformerMixin):
2925
2985
 
2926
2986
  @staticmethod
2927
2987
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2928
- for col, t in search_keys.items():
2929
- if t == SearchKey.EMAIL:
2930
- return col
2988
+ cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2989
+ if len(cols) > 1:
2990
+ raise Exception("More than one email column found after unnest")
2991
+ if len(cols) == 1:
2992
+ return cols[0]
2931
2993
 
2932
2994
  @staticmethod
2933
2995
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2934
- for col, t in search_keys.items():
2935
- if t == SearchKey.HEM:
2936
- return col
2996
+ cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2997
+ if len(cols) > 1:
2998
+ raise Exception("More than one hem column found after unnest")
2999
+ if len(cols) == 1:
3000
+ return cols[0]
2937
3001
 
2938
3002
  @staticmethod
2939
3003
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -2941,8 +3005,44 @@ class FeaturesEnricher(TransformerMixin):
2941
3005
  if t == SearchKey.PHONE:
2942
3006
  return col
2943
3007
 
3008
def _explode_multiple_search_keys(
    self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """Unpivot columns that share a search-key type into a single column.

    For every key type assigned to more than one column, the columns are
    melted into one new column named ``upgini_<type>_unnest``; the
    ``SEARCH_KEY_UNNEST`` column records which original column each value
    came from. When several key types are repeated, the melted frames are
    concatenated row-wise.

    Args:
        df: Input frame containing the search-key columns.
        search_keys: Mapping of column name to key type. It is MODIFIED IN
            PLACE: the melted columns are removed and the new unnest column
            is registered under the same key type.

    Returns:
        A tuple of the (possibly exploded) frame and a mapping from each new
        unnest column to the list of original column names it replaced.
        When no key type is repeated, ``df`` is returned unchanged together
        with an empty dict.
    """
    # Group column names by key type, preserving first-seen order.
    names_by_type: Dict[SearchKey, List[str]] = {}
    for key_name, key_type in search_keys.items():
        names_by_type.setdefault(key_type, []).append(key_name)

    repeated = {key_type: names for key_type, names in names_by_type.items() if len(names) > 1}
    if not repeated:
        return df, {}

    multiple_keys_columns = {col for names in repeated.values() for col in names}
    other_columns = [col for col in df.columns if col not in multiple_keys_columns]

    exploded_dfs = []
    unnest_search_keys: Dict[str, List[str]] = {}
    for key_type, key_names in repeated.items():
        new_search_key = f"upgini_{key_type.name.lower()}_unnest"
        exploded_dfs.append(
            pd.melt(
                df,
                id_vars=other_columns,
                value_vars=key_names,
                var_name=SEARCH_KEY_UNNEST,
                value_name=new_search_key,
            )
        )
        # Swap the original columns for the unnest column in the caller's mapping.
        for old_key in key_names:
            del search_keys[old_key]
        search_keys[new_search_key] = key_type
        unnest_search_keys[new_search_key] = key_names

    return pd.concat(exploded_dfs, ignore_index=True), unnest_search_keys
3039
+
2944
3040
  def __add_fit_system_record_id(
2945
- self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
3041
+ self,
3042
+ df: pd.DataFrame,
3043
+ meaning_types: Dict[str, FileColumnMeaningType],
3044
+ search_keys: Dict[str, SearchKey],
3045
+ id_name: str,
2946
3046
  ) -> pd.DataFrame:
2947
3047
  # save original order or rows
2948
3048
  original_index_name = df.index.name
@@ -2953,7 +3053,14 @@ class FeaturesEnricher(TransformerMixin):
2953
3053
 
2954
3054
  # order by date and idempotent order by other keys
2955
3055
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
2956
- sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
3056
+ sort_exclude_columns = [
3057
+ original_order_name,
3058
+ ORIGINAL_INDEX,
3059
+ EVAL_SET_INDEX,
3060
+ TARGET,
3061
+ "__target",
3062
+ ENTITY_SYSTEM_RECORD_ID,
3063
+ ]
2957
3064
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2958
3065
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2959
3066
  sort_exclude_columns.append(self._get_date_column(search_keys))
@@ -2991,14 +3098,18 @@ class FeaturesEnricher(TransformerMixin):
2991
3098
 
2992
3099
  df = df.reset_index(drop=True).reset_index()
2993
3100
  # system_record_id saves correct order for fit
2994
- df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3101
+ df = df.rename(columns={DEFAULT_INDEX: id_name})
2995
3102
 
2996
3103
  # return original order
2997
3104
  df = df.set_index(ORIGINAL_INDEX)
2998
3105
  df.index.name = original_index_name
2999
3106
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3000
3107
 
3001
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3108
+ meaning_types[id_name] = (
3109
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3110
+ if id_name == SYSTEM_RECORD_ID
3111
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3112
+ )
3002
3113
  return df
3003
3114
 
3004
3115
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3053,7 +3164,11 @@ class FeaturesEnricher(TransformerMixin):
3053
3164
  )
3054
3165
 
3055
3166
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3056
- dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3167
+ dup_features = [
3168
+ c
3169
+ for c in comparing_columns
3170
+ if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3171
+ ]
3057
3172
  if len(dup_features) > 0:
3058
3173
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3059
3174
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3064,8 +3179,7 @@ class FeaturesEnricher(TransformerMixin):
3064
3179
  result_features = pd.merge(
3065
3180
  df_with_original_index,
3066
3181
  result_features,
3067
- left_on=SYSTEM_RECORD_ID,
3068
- right_on=SYSTEM_RECORD_ID,
3182
+ on=ENTITY_SYSTEM_RECORD_ID,
3069
3183
  how="left" if is_transform else "inner",
3070
3184
  )
3071
3185
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3076,7 +3190,7 @@ class FeaturesEnricher(TransformerMixin):
3076
3190
  result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
3077
3191
  self.logger.info(f"After dropping target outliers size: {len(result_features)}")
3078
3192
 
3079
- result_eval_sets = dict()
3193
+ result_eval_sets = {}
3080
3194
  if not is_transform and EVAL_SET_INDEX in result_features.columns:
3081
3195
  result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
3082
3196
  eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
@@ -3288,7 +3402,7 @@ class FeaturesEnricher(TransformerMixin):
3288
3402
  if autofe_feature.op.is_vector:
3289
3403
  continue
3290
3404
 
3291
- description = dict()
3405
+ description = {}
3292
3406
 
3293
3407
  feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
3294
3408
  if feature_meta is None:
@@ -3454,13 +3568,13 @@ class FeaturesEnricher(TransformerMixin):
3454
3568
  self.warning_counter.increment()
3455
3569
 
3456
3570
  if len(valid_search_keys) == 1:
3457
- for k, v in valid_search_keys.items():
3458
- # Show warning for country only if country is the only key
3459
- if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3460
- msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3461
- print(msg)
3462
- self.logger.warning(msg)
3463
- self.warning_counter.increment()
3571
+ key, value = list(valid_search_keys.items())[0]
3572
+ # Show warning when the only search key has a single constant value
3573
+ if x[key].nunique() == 1:
3574
+ msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3575
+ print(msg)
3576
+ self.logger.warning(msg)
3577
+ self.warning_counter.increment()
3464
3578
 
3465
3579
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3466
3580
 
@@ -3570,61 +3684,68 @@ class FeaturesEnricher(TransformerMixin):
3570
3684
  def check_need_detect(search_key: SearchKey):
3571
3685
  return not is_transform or search_key in self.fit_search_keys.values()
3572
3686
 
3573
- if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3574
- maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3575
- if maybe_key is not None:
3576
- search_keys[maybe_key] = SearchKey.POSTAL_CODE
3577
- self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3578
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3687
+ # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3688
+ if check_need_detect(SearchKey.POSTAL_CODE):
3689
+ maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3690
+ if maybe_keys:
3691
+ new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3692
+ search_keys.update(new_keys)
3693
+ self.autodetected_search_keys.update(new_keys)
3694
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3579
3695
  if not silent_mode:
3580
- print(self.bundle.get("postal_code_detected").format(maybe_key))
3696
+ print(self.bundle.get("postal_code_detected").format(maybe_keys))
3581
3697
 
3582
3698
  if (
3583
3699
  SearchKey.COUNTRY not in search_keys.values()
3584
3700
  and self.country_code is None
3585
3701
  and check_need_detect(SearchKey.COUNTRY)
3586
3702
  ):
3587
- maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3588
- if maybe_key is not None:
3589
- search_keys[maybe_key] = SearchKey.COUNTRY
3590
- self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3703
+ maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3704
+ if maybe_key:
3705
+ search_keys[maybe_key[0]] = SearchKey.COUNTRY
3706
+ self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3591
3707
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3592
3708
  if not silent_mode:
3593
3709
  print(self.bundle.get("country_detected").format(maybe_key))
3594
3710
 
3595
3711
  if (
3596
- SearchKey.EMAIL not in search_keys.values()
3597
- and SearchKey.HEM not in search_keys.values()
3712
+ # SearchKey.EMAIL not in search_keys.values()
3713
+ SearchKey.HEM not in search_keys.values()
3598
3714
  and check_need_detect(SearchKey.HEM)
3599
3715
  ):
3600
- maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3601
- if maybe_key is not None and maybe_key not in search_keys.keys():
3716
+ maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3717
+ if maybe_keys:
3602
3718
  if self.__is_registered or is_demo_dataset:
3603
- search_keys[maybe_key] = SearchKey.EMAIL
3604
- self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3605
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3719
+ new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3720
+ search_keys.update(new_keys)
3721
+ self.autodetected_search_keys.update(new_keys)
3722
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3606
3723
  if not silent_mode:
3607
- print(self.bundle.get("email_detected").format(maybe_key))
3724
+ print(self.bundle.get("email_detected").format(maybe_keys))
3608
3725
  else:
3609
3726
  self.logger.warning(
3610
- f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3727
+ f"Autodetected search key EMAIL in column {maybe_keys}."
3728
+ " But not used because not registered user"
3611
3729
  )
3612
3730
  if not silent_mode:
3613
- print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3731
+ print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3614
3732
  self.warning_counter.increment()
3615
3733
 
3616
- if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3617
- maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3618
- if maybe_key is not None and maybe_key not in search_keys.keys():
3734
+ # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3735
+ if check_need_detect(SearchKey.PHONE):
3736
+ maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3737
+ if maybe_keys:
3619
3738
  if self.__is_registered or is_demo_dataset:
3620
- search_keys[maybe_key] = SearchKey.PHONE
3621
- self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3622
- self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3739
+ new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3740
+ search_keys.update(new_keys)
3741
+ self.autodetected_search_keys.update(new_keys)
3742
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3623
3743
  if not silent_mode:
3624
- print(self.bundle.get("phone_detected").format(maybe_key))
3744
+ print(self.bundle.get("phone_detected").format(maybe_keys))
3625
3745
  else:
3626
3746
  self.logger.warning(
3627
- f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3747
+ f"Autodetected search key PHONE in column {maybe_keys}. "
3748
+ "But not used because not registered user"
3628
3749
  )
3629
3750
  if not silent_mode:
3630
3751
  print(self.bundle.get("phone_detected_not_registered"))
upgini/metadata.py CHANGED
@@ -6,6 +6,8 @@ from typing import Dict, List, Optional, Set
6
6
  from pydantic import BaseModel
7
7
 
8
8
  SYSTEM_RECORD_ID = "system_record_id"
9
+ ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
10
+ SEARCH_KEY_UNNEST = "search_key_unnest"
9
11
  SORT_ID = "sort_id"
10
12
  EVAL_SET_INDEX = "eval_set_index"
11
13
  TARGET = "target"
@@ -13,7 +15,7 @@ COUNTRY = "country_iso_code"
13
15
  RENAMED_INDEX = "index_col"
14
16
  DEFAULT_INDEX = "index"
15
17
  ORIGINAL_INDEX = "original_index"
16
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
18
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
17
19
 
18
20
 
19
21
  class FileColumnMeaningType(Enum):
@@ -39,6 +41,8 @@ class FileColumnMeaningType(Enum):
39
41
  POSTAL_CODE = "POSTAL_CODE"
40
42
  SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
41
43
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
44
+ ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
+ UNNEST_KEY = "UNNEST_KEY"
42
46
 
43
47
 
44
48
  class SearchKey(Enum):
@@ -184,6 +188,10 @@ class FileColumnMetadata(BaseModel):
184
188
  meaningType: FileColumnMeaningType
185
189
  minMaxValues: Optional[NumericInterval] = None
186
190
  originalName: Optional[str]
191
+ # whether this column contains keys from multiple key columns like msisdn1, msisdn2
192
+ isUnnest: bool = False
193
+ # list of original etalon key column names like msisdn1, msisdn2
194
+ unnestKeyNames: Optional[list[str]]
187
195
 
188
196
 
189
197
  class FileMetadata(BaseModel):
@@ -281,7 +289,7 @@ class FeaturesFilter(BaseModel):
281
289
 
282
290
 
283
291
  class RuntimeParameters(BaseModel):
284
- properties: Dict[str, str] = dict()
292
+ properties: Dict[str, str] = {}
285
293
 
286
294
 
287
295
  class SearchCustomization(BaseModel):
upgini/metrics.py CHANGED
@@ -369,7 +369,7 @@ class EstimatorWrapper:
369
369
  "logger": logger,
370
370
  }
371
371
  if estimator is None:
372
- params = dict()
372
+ params = {}
373
373
  params["has_time"] = has_date
374
374
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
375
375
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
@@ -88,6 +88,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
88
  search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
89
89
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
90
90
  single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
91
+ unsupported_multi_key=Search key {} cannot be used multiple times
91
92
  unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
92
93
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
93
94
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
11
  raise NotImplementedError
12
12
 
13
- def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
- for column_name in column_names:
15
- if self._is_search_key_by_name(column_name):
16
- return column_name
13
+ def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
14
+ return [
15
+ column_name
16
+ for column_name in column_names
17
+ if self._is_search_key_by_name(column_name)
18
+ ]
17
19
 
18
- def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
19
- maybe_column = self._get_search_key_by_name(df.columns.to_list())
20
- if maybe_column is not None:
21
- return maybe_column
22
-
23
- for column_name in df.columns:
20
+ def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
21
+ other_columns = [col for col in df.columns if col not in existing_search_keys]
22
+ columns_by_names = self._get_search_keys_by_name(other_columns)
23
+ columns_by_values = []
24
+ for column_name in other_columns:
24
25
  if self._is_search_key_by_values(df[column_name]):
25
- return column_name
26
+ columns_by_values.append(column_name)
27
+ return list(set(columns_by_names + columns_by_values))
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import (
7
+ ENTITY_SYSTEM_RECORD_ID,
8
+ EVAL_SET_INDEX,
9
+ SORT_ID,
10
+ SYSTEM_RECORD_ID,
11
+ TARGET,
12
+ ModelTaskType,
13
+ SearchKey,
14
+ )
7
15
  from upgini.resource_bundle import ResourceBundle
8
16
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
17
  from upgini.utils.target_utils import define_task
@@ -143,6 +151,8 @@ def clean_full_duplicates(
143
151
  unique_columns = df.columns.tolist()
144
152
  if SYSTEM_RECORD_ID in unique_columns:
145
153
  unique_columns.remove(SYSTEM_RECORD_ID)
154
+ if ENTITY_SYSTEM_RECORD_ID in unique_columns:
155
+ unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
146
156
  if SORT_ID in unique_columns:
147
157
  unique_columns.remove(SORT_ID)
148
158
  if EVAL_SET_INDEX in unique_columns:
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
38
38
  email_column: str,
39
39
  hem_column: Optional[str],
40
40
  search_keys: Dict[str, SearchKey],
41
+ unnest_search_keys: Optional[List[str]] = None,
41
42
  logger: Optional[logging.Logger] = None,
42
43
  ):
43
44
  self.email_column = email_column
44
45
  self.hem_column = hem_column
45
46
  self.search_keys = search_keys
47
+ self.unnest_search_keys = unnest_search_keys
46
48
  if logger is not None:
47
49
  self.logger = logger
48
50
  else:
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
80
82
  del self.search_keys[self.email_column]
81
83
  return df
82
84
  self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
85
+ self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
83
86
  self.email_converted_to_hem = True
84
87
 
85
88
  del self.search_keys[self.email_column]
89
+ if self.email_column in self.unnest_search_keys:
90
+ self.unnest_search_keys.remove(self.email_column)
86
91
 
87
92
  df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
88
93
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.297
3
+ Version: 1.1.299
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -131,7 +131,7 @@ Description-Content-Type: text/markdown
131
131
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
132
132
  |World economic indicators|191 |41|-|Monthly|date, country|No
133
133
  |Markets data|-|17|-|Monthly|date, datetime|No
134
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
134
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
135
135
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
136
136
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
137
137
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -840,4 +840,4 @@ Some convenient ways to start contributing are:
840
840
  - [More perks for registered users](https://profile.upgini.com)
841
841
 
842
842
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
843
- Please report it here</a></sup>
843
+ Please report it here</a></sup>
@@ -1,23 +1,23 @@
1
- upgini/__about__.py,sha256=wXLUjYbwCXLCqMCPB8OqSFq3oPafpNboNnGSVzAPaUk,24
1
+ upgini/__about__.py,sha256=UQSkVroAOLINihC2MNtqGFKJfbFI7X9Lopc0G3W2z0I,24
2
2
  upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
4
+ upgini/dataset.py,sha256=MOzBVsvzlHLxNfPWtMaXC_jIPeW7_gUvbSGeXnsPgNI,46158
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=HQFLw3VyEsZfAt4xFnIYOnp3fzQSHAsyHzIm0gTJpOI,177543
6
+ upgini/features_enricher.py,sha256=afPo71dYHp9edimm3TWeAzr6aY0sPYRkvdqR34cQP9A,183157
7
7
  upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
8
8
  upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
- upgini/metadata.py,sha256=qDAIO7NLSSQp_XiXCv3U4XJTLO0KH3YuQ8lvCLYPqzs,9781
10
- upgini/metrics.py,sha256=DiDgdFvYu64ArlPEgjppZShK6yybWtIEbdPAhI3yO1I,30930
9
+ upgini/metadata.py,sha256=wOFCJruDBhC4Hiiiqf8GeHZnnm6rhJy8t6fg5B0Z4TQ,10209
10
+ upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
11
11
  upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=cpwUfhZWF9QBfrUyJ0xZ72iGYyt1eXIZQ46FB-7ZDI4,2421
17
+ upgini/autofe/all_operands.py,sha256=XbvgX2IU4aee9rJZ--d5MdmrfKhON_emle5-RU1qlEY,2506
18
18
  upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
19
- upgini/autofe/date.py,sha256=qzk0NT332Q0vR1eRwTuNiMSrGE3ulh6Ic3QLBZqSdvw,7284
20
- upgini/autofe/feature.py,sha256=_V9B74B3ue7eAYXSOt9JKhVC9klkAKks22MwnBRye_w,12487
19
+ upgini/autofe/date.py,sha256=8zYVhjl7jVS4xt-IjCgk9px2LHnACX2YlMlmDELlRTc,7943
20
+ upgini/autofe/feature.py,sha256=ayxiF8Ip1ww_pt_BC9Pk127fAHZ_3fuluulS1EYLolk,13423
21
21
  upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
22
  upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
23
  upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
@@ -30,22 +30,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/phone_normalizer.py,sha256=EzTaahk6myRv6ZXgbyVFGY4kpo_2VlQgOrm5_lfbmNI,9996
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=1oHurL4I83P2lXIavx9vSdKM8ZqncAPXH2IZf76bD6g,26292
33
+ upgini/resource_bundle/strings.properties,sha256=6jYqcxj06ZopXwr5YYMGXX1QiNNJNFo2SuwAR0qleRk,26358
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
37
37
  upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
38
38
  upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
39
39
  upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
40
- upgini/utils/base_search_key_detector.py,sha256=UNs2uxEcD1N_mOtkx3k6U70DCajW-QEO2vZp41GF0mU,855
40
+ upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
41
41
  upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
42
42
  upgini/utils/country_utils.py,sha256=yE8oRgMpXuJxPfQm4fioY6dg6700HgVnHSk4Cv9sUyM,6511
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
45
  upgini/utils/datetime_utils.py,sha256=Ujmu1ouwSFtG5SywQXJlmtDnGigAnIWPdE5Vx5NvgUM,10951
46
- upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
46
+ upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
- upgini/utils/email_utils.py,sha256=PLufTO97Pg9PPsNqB9agcM6M98MIxKUgIgNn2mVwSQ0,3520
48
+ upgini/utils/email_utils.py,sha256=aKHa4xVBSsEsiZtFCPj_DrUaFupceYfvJeP_e8w_D5E,3813
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.297.dist-info/METADATA,sha256=RwOihmiasIoIOFrOuY-WcLPOc4Fgt5QlztJGwCg5QQ8,48151
61
- upgini-1.1.297.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.297.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.297.dist-info/RECORD,,
60
+ upgini-1.1.299.dist-info/METADATA,sha256=IITBjTPICcuZmOUmDbmU_GdWOLVSSrDw_llm2mHGK3A,48153
61
+ upgini-1.1.299.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.299.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.299.dist-info/RECORD,,