upgini 1.1.275__py3-none-any.whl → 1.1.275a1__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -1,5 +1,4 @@
1
1
  import dataclasses
2
- import datetime
3
2
  import gc
4
3
  import hashlib
5
4
  import itertools
@@ -11,6 +10,7 @@ import sys
11
10
  import tempfile
12
11
  import time
13
12
  import uuid
13
+ from collections import Counter
14
14
  from dataclasses import dataclass
15
15
  from threading import Thread
16
16
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -21,7 +21,6 @@ from pandas.api.types import (
21
21
  is_bool,
22
22
  is_datetime64_any_dtype,
23
23
  is_numeric_dtype,
24
- is_object_dtype,
25
24
  is_period_dtype,
26
25
  is_string_dtype,
27
26
  )
@@ -45,9 +44,11 @@ from upgini.mdc import MDC
45
44
  from upgini.metadata import (
46
45
  COUNTRY,
47
46
  DEFAULT_INDEX,
47
+ ENTITY_SYSTEM_RECORD_ID,
48
48
  EVAL_SET_INDEX,
49
49
  ORIGINAL_INDEX,
50
50
  RENAMED_INDEX,
51
+ SEARCH_KEY_UNNEST,
51
52
  SORT_ID,
52
53
  SYSTEM_RECORD_ID,
53
54
  TARGET,
@@ -148,7 +149,6 @@ class FeaturesEnricher(TransformerMixin):
148
149
  """
149
150
 
150
151
  TARGET_NAME = "target"
151
- CURRENT_DATE = "current_date"
152
152
  RANDOM_STATE = 42
153
153
  CALCULATE_METRICS_THRESHOLD = 50_000_000
154
154
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -210,7 +210,6 @@ class FeaturesEnricher(TransformerMixin):
210
210
  client_ip: Optional[str] = None,
211
211
  client_visitorid: Optional[str] = None,
212
212
  custom_bundle_config: Optional[str] = None,
213
- add_date_if_missing: bool = True,
214
213
  **kwargs,
215
214
  ):
216
215
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -321,7 +320,6 @@ class FeaturesEnricher(TransformerMixin):
321
320
  self.raise_validation_error = raise_validation_error
322
321
  self.exclude_columns = exclude_columns
323
322
  self.baseline_score_column = baseline_score_column
324
- self.add_date_if_missing = add_date_if_missing
325
323
 
326
324
  def _get_api_key(self):
327
325
  return self._api_key
@@ -425,9 +423,6 @@ class FeaturesEnricher(TransformerMixin):
425
423
 
426
424
  self.__validate_search_keys(self.search_keys, self.search_id)
427
425
 
428
- # Validate client estimator params
429
- self._get_client_cat_features(estimator, X, self.search_keys)
430
-
431
426
  try:
432
427
  self.X = X
433
428
  self.y = y
@@ -821,7 +816,6 @@ class FeaturesEnricher(TransformerMixin):
821
816
  trace_id = trace_id or str(uuid.uuid4())
822
817
  start_time = time.time()
823
818
  with MDC(trace_id=trace_id):
824
- self.logger.info("Start calculate metrics")
825
819
  if len(args) > 0:
826
820
  msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
827
821
  self.logger.warning(msg)
@@ -873,9 +867,22 @@ class FeaturesEnricher(TransformerMixin):
873
867
  self.__display_support_link(msg)
874
868
  return None
875
869
 
876
- cat_features, search_keys_for_metrics = self._get_client_cat_features(
877
- estimator, effective_X, self.search_keys
878
- )
870
+ cat_features = None
871
+ search_keys_for_metrics = []
872
+ if (
873
+ estimator is not None
874
+ and hasattr(estimator, "get_param")
875
+ and estimator.get_param("cat_features") is not None
876
+ ):
877
+ cat_features = estimator.get_param("cat_features")
878
+ if len(cat_features) > 0 and isinstance(cat_features[0], int):
879
+ cat_features = [effective_X.columns[i] for i in cat_features]
880
+ for cat_feature in cat_features:
881
+ if cat_feature in self.search_keys:
882
+ if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
883
+ search_keys_for_metrics.append(cat_feature)
884
+ else:
885
+ raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
879
886
 
880
887
  prepared_data = self._prepare_data_for_metrics(
881
888
  trace_id=trace_id,
@@ -890,7 +897,6 @@ class FeaturesEnricher(TransformerMixin):
890
897
  search_keys_for_metrics=search_keys_for_metrics,
891
898
  progress_bar=progress_bar,
892
899
  progress_callback=progress_callback,
893
- cat_features=cat_features,
894
900
  )
895
901
  if prepared_data is None:
896
902
  return None
@@ -1178,6 +1184,8 @@ class FeaturesEnricher(TransformerMixin):
1178
1184
  search_keys = self.search_keys.copy()
1179
1185
  search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1180
1186
 
1187
+ unnest_search_keys = []
1188
+
1181
1189
  extended_X = x.copy()
1182
1190
  generated_features = []
1183
1191
  date_column = self._get_date_column(search_keys)
@@ -1188,7 +1196,7 @@ class FeaturesEnricher(TransformerMixin):
1188
1196
  email_column = self._get_email_column(search_keys)
1189
1197
  hem_column = self._get_hem_column(search_keys)
1190
1198
  if email_column:
1191
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1199
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, unnest_search_keys, self.logger)
1192
1200
  extended_X = converter.convert(extended_X)
1193
1201
  generated_features.extend(converter.generated_features)
1194
1202
  if (
@@ -1266,29 +1274,6 @@ class FeaturesEnricher(TransformerMixin):
1266
1274
 
1267
1275
  return _cv, groups
1268
1276
 
1269
- def _get_client_cat_features(
1270
- self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1271
- ) -> Optional[List[str]]:
1272
- cat_features = None
1273
- search_keys_for_metrics = []
1274
- if (
1275
- estimator is not None
1276
- and hasattr(estimator, "get_param")
1277
- and estimator.get_param("cat_features") is not None
1278
- ):
1279
- cat_features = estimator.get_param("cat_features")
1280
- if len(cat_features) > 0:
1281
- if all([isinstance(f, int) for f in cat_features]):
1282
- cat_features = [X.columns[i] for i in cat_features]
1283
- self.logger.info(f"Collected categorical features {cat_features} from user estimator")
1284
- for cat_feature in cat_features:
1285
- if cat_feature in search_keys:
1286
- if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
1287
- search_keys_for_metrics.append(cat_feature)
1288
- else:
1289
- raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
1290
- return cat_features, search_keys_for_metrics
1291
-
1292
1277
  def _prepare_data_for_metrics(
1293
1278
  self,
1294
1279
  trace_id: str,
@@ -1303,7 +1288,6 @@ class FeaturesEnricher(TransformerMixin):
1303
1288
  search_keys_for_metrics: Optional[List[str]] = None,
1304
1289
  progress_bar: Optional[ProgressBar] = None,
1305
1290
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1306
- cat_features: Optional[List[str]] = None,
1307
1291
  ):
1308
1292
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1309
1293
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
@@ -1361,8 +1345,9 @@ class FeaturesEnricher(TransformerMixin):
1361
1345
 
1362
1346
  # Detect and drop high cardinality columns in train
1363
1347
  columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
1364
- non_excluding_columns = (self.generate_features or []) + (cat_features or [])
1365
- columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
1348
+ columns_with_high_cardinality = [
1349
+ c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
1350
+ ]
1366
1351
  if len(columns_with_high_cardinality) > 0:
1367
1352
  self.logger.warning(
1368
1353
  f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
@@ -1824,11 +1809,10 @@ class FeaturesEnricher(TransformerMixin):
1824
1809
  else:
1825
1810
  features_section = ""
1826
1811
 
1827
- search_id = self._search_task.search_task_id
1828
- api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
1812
+ api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
1829
1813
  -H 'Authorization: {self.api_key}' \\
1830
1814
  -H 'Content-Type: application/json' \\
1831
- -d '{{"search_keys": {keys}{features_section}}}'"""
1815
+ -d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
1832
1816
  return api_example
1833
1817
 
1834
1818
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -1923,13 +1907,38 @@ class FeaturesEnricher(TransformerMixin):
1923
1907
  generated_features.extend(converter.generated_features)
1924
1908
  else:
1925
1909
  self.logger.info("Input dataset hasn't date column")
1926
- if self.add_date_if_missing:
1927
- df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1910
+
1911
+ # Don't pass all features in backend on transform
1912
+ original_features_for_transform = []
1913
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1914
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1915
+ if len(features_not_to_pass) > 0:
1916
+ # Pass only features that need for transform
1917
+ features_for_transform = self._search_task.get_features_for_transform()
1918
+ if features_for_transform is not None and len(features_for_transform) > 0:
1919
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1920
+ original_features_for_transform = [
1921
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1922
+ ]
1923
+
1924
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1925
+
1926
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1927
+
1928
+ df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1929
+ df[columns_for_system_record_id], index=False
1930
+ ).astype("Float64")
1931
+
1932
+ # Explode multiple search keys
1933
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1934
+
1928
1935
  email_column = self._get_email_column(search_keys)
1929
1936
  hem_column = self._get_hem_column(search_keys)
1930
1937
  email_converted_to_hem = False
1931
1938
  if email_column:
1932
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1939
+ converter = EmailSearchKeyConverter(
1940
+ email_column, hem_column, search_keys, unnest_search_keys, self.logger
1941
+ )
1933
1942
  df = converter.convert(df)
1934
1943
  generated_features.extend(converter.generated_features)
1935
1944
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1943,30 +1952,21 @@ class FeaturesEnricher(TransformerMixin):
1943
1952
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1944
1953
 
1945
1954
  meaning_types = {col: key.value for col, key in search_keys.items()}
1946
- non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1955
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1956
+ for col in original_features_for_transform:
1957
+ meaning_types[col] = FileColumnMeaningType.FEATURE
1958
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1947
1959
 
1948
1960
  if email_converted_to_hem:
1949
- non_keys_columns.append(email_column)
1950
-
1951
- # Don't pass features in backend on transform
1952
- original_features_for_transform = None
1953
- runtime_parameters = self._get_copy_of_runtime_parameters()
1954
- if len(non_keys_columns) > 0:
1955
- # Pass only features that need for transform
1956
- features_for_transform = self._search_task.get_features_for_transform()
1957
- if features_for_transform is not None and len(features_for_transform) > 0:
1958
- file_metadata = self._search_task.get_file_metadata(trace_id)
1959
- original_features_for_transform = [
1960
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1961
- ]
1962
- non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1961
+ features_not_to_pass.append(email_column)
1963
1962
 
1964
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1963
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1964
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1965
1965
 
1966
1966
  if add_fit_system_record_id:
1967
1967
  df = self.__add_fit_system_record_id(df, dict(), search_keys)
1968
1968
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1969
- non_keys_columns.append(SORT_ID)
1969
+ features_not_to_pass.append(SORT_ID)
1970
1970
 
1971
1971
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1972
1972
 
@@ -1974,16 +1974,19 @@ class FeaturesEnricher(TransformerMixin):
1974
1974
  "Float64"
1975
1975
  )
1976
1976
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1977
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
1978
+ if SEARCH_KEY_UNNEST in df.columns:
1979
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
1977
1980
 
1978
1981
  df = df.reset_index(drop=True)
1979
- system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
1982
+ system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1980
1983
  if add_fit_system_record_id:
1981
1984
  system_columns_with_original_index.append(SORT_ID)
1982
1985
  df_with_original_index = df[system_columns_with_original_index].copy()
1983
1986
 
1984
1987
  combined_search_keys = combine_search_keys(search_keys.keys())
1985
1988
 
1986
- df_without_features = df.drop(columns=non_keys_columns)
1989
+ df_without_features = df.drop(columns=features_not_to_pass)
1987
1990
 
1988
1991
  df_without_features = clean_full_duplicates(
1989
1992
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -2139,6 +2142,14 @@ class FeaturesEnricher(TransformerMixin):
2139
2142
 
2140
2143
  key_types = search_keys.values()
2141
2144
 
2145
+ # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2146
+ multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2147
+ for multi_key in multi_keys:
2148
+ if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2149
+ msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2150
+ self.logger.warning(msg)
2151
+ raise ValidationError(msg)
2152
+
2142
2153
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2143
2154
  msg = self.bundle.get("date_and_datetime_simultanious")
2144
2155
  self.logger.warning(msg)
@@ -2154,11 +2165,11 @@ class FeaturesEnricher(TransformerMixin):
2154
2165
  self.logger.warning(msg)
2155
2166
  raise ValidationError(msg)
2156
2167
 
2157
- for key_type in SearchKey.__members__.values():
2158
- if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2159
- msg = self.bundle.get("multiple_search_key").format(key_type)
2160
- self.logger.warning(msg)
2161
- raise ValidationError(msg)
2168
+ # for key_type in SearchKey.__members__.values():
2169
+ # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2170
+ # msg = self.bundle.get("multiple_search_key").format(key_type)
2171
+ # self.logger.warning(msg)
2172
+ # raise ValidationError(msg)
2162
2173
 
2163
2174
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2164
2175
  # if (
@@ -2294,16 +2305,7 @@ class FeaturesEnricher(TransformerMixin):
2294
2305
  self.fit_generated_features.extend(converter.generated_features)
2295
2306
  else:
2296
2307
  self.logger.info("Input dataset hasn't date column")
2297
- if self.add_date_if_missing:
2298
- df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2299
- email_column = self._get_email_column(self.fit_search_keys)
2300
- hem_column = self._get_hem_column(self.fit_search_keys)
2301
- email_converted_to_hem = False
2302
- if email_column:
2303
- converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2304
- df = converter.convert(df)
2305
- self.fit_generated_features.extend(converter.generated_features)
2306
- email_converted_to_hem = converter.email_converted_to_hem
2308
+
2307
2309
  if (
2308
2310
  self.detect_missing_search_keys
2309
2311
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2312,7 +2314,37 @@ class FeaturesEnricher(TransformerMixin):
2312
2314
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2313
2315
  df = converter.convert(df)
2314
2316
 
2317
+ # Explode multiple search keys
2315
2318
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2319
+ meaning_types = {
2320
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2321
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2322
+ }
2323
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2324
+ if eval_set is not None and len(eval_set) > 0:
2325
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2326
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2327
+
2328
+ # TODO check that this is correct for enrichment
2329
+ self.df_with_original_index = df.copy()
2330
+
2331
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2332
+
2333
+ # Convert EMAIL to HEM after unnesting to do it only with one column
2334
+ email_column = self._get_email_column(self.fit_search_keys)
2335
+ hem_column = self._get_hem_column(self.fit_search_keys)
2336
+ email_converted_to_hem = False
2337
+ if email_column:
2338
+ converter = EmailSearchKeyConverter(
2339
+ email_column, hem_column, self.fit_search_keys, unnest_search_keys, self.logger
2340
+ )
2341
+ df = converter.convert(df)
2342
+ self.fit_generated_features.extend(converter.generated_features)
2343
+ email_converted_to_hem = converter.email_converted_to_hem
2344
+
2345
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2346
+ self.fit_search_keys.keys()
2347
+ )
2316
2348
  if email_converted_to_hem:
2317
2349
  non_feature_columns.append(email_column)
2318
2350
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2336,12 +2368,14 @@ class FeaturesEnricher(TransformerMixin):
2336
2368
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2337
2369
  }
2338
2370
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2371
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2372
+ if SEARCH_KEY_UNNEST in df.columns:
2373
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2339
2374
  if eval_set is not None and len(eval_set) > 0:
2340
2375
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2341
2376
 
2342
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2377
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2343
2378
 
2344
- self.df_with_original_index = df.copy()
2345
2379
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2346
2380
 
2347
2381
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2349,14 +2383,15 @@ class FeaturesEnricher(TransformerMixin):
2349
2383
  dataset = Dataset(
2350
2384
  "tds_" + str(uuid.uuid4()),
2351
2385
  df=df,
2386
+ meaning_types=meaning_types,
2387
+ search_keys=combined_search_keys,
2388
+ unnest_search_keys=unnest_search_keys,
2352
2389
  model_task_type=model_task_type,
2353
2390
  date_format=self.date_format,
2354
2391
  random_state=self.random_state,
2355
2392
  rest_client=self.rest_client,
2356
2393
  logger=self.logger,
2357
2394
  )
2358
- dataset.meaning_types = meaning_types
2359
- dataset.search_keys = combined_search_keys
2360
2395
  if email_converted_to_hem:
2361
2396
  dataset.ignore_columns = [email_column]
2362
2397
 
@@ -2876,25 +2911,6 @@ class FeaturesEnricher(TransformerMixin):
2876
2911
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2877
2912
  return col
2878
2913
 
2879
- @staticmethod
2880
- def _add_current_date_as_key(
2881
- df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
2882
- ) -> pd.DataFrame:
2883
- if (
2884
- set(search_keys.values()) == {SearchKey.PHONE}
2885
- or set(search_keys.values()) == {SearchKey.EMAIL}
2886
- or set(search_keys.values()) == {SearchKey.HEM}
2887
- or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
2888
- ):
2889
- msg = bundle.get("current_date_added")
2890
- print(msg)
2891
- logger.warning(msg)
2892
- df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2893
- search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2894
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
2895
- df = converter.convert(df)
2896
- return df
2897
-
2898
2914
  @staticmethod
2899
2915
  def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
2900
2916
  return [
@@ -2905,15 +2921,19 @@ class FeaturesEnricher(TransformerMixin):
2905
2921
 
2906
2922
  @staticmethod
2907
2923
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2908
- for col, t in search_keys.items():
2909
- if t == SearchKey.EMAIL:
2910
- return col
2924
+ cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2925
+ if len(cols) > 1:
2926
+ raise Exception("More than one email column found after unnest")
2927
+ if len(cols) == 1:
2928
+ return cols[0]
2911
2929
 
2912
2930
  @staticmethod
2913
2931
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2914
- for col, t in search_keys.items():
2915
- if t == SearchKey.HEM:
2916
- return col
2932
+ cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2933
+ if len(cols) > 1:
2934
+ raise Exception("More than one hem column found after unnest")
2935
+ if len(cols) == 1:
2936
+ return cols[0]
2917
2937
 
2918
2938
  @staticmethod
2919
2939
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -2921,8 +2941,42 @@ class FeaturesEnricher(TransformerMixin):
2921
2941
  if t == SearchKey.PHONE:
2922
2942
  return col
2923
2943
 
2944
+ def _explode_multiple_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
2945
+ # find groups of multiple search keys
2946
+ search_key_names_by_type: Dict[SearchKey, str] = dict()
2947
+ for key_name, key_type in search_keys.items():
2948
+ search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
2949
+ search_key_names_by_type = {
2950
+ key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
2951
+ }
2952
+ if len(search_key_names_by_type) == 0:
2953
+ return df, []
2954
+
2955
+ multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
2956
+ other_columns = [col for col in df.columns if col not in multiple_keys_columns]
2957
+ exploded_dfs = []
2958
+ unnest_search_keys = []
2959
+
2960
+ for key_type, key_names in search_key_names_by_type.items():
2961
+ new_search_key = f"upgini_{key_type.name.lower()}_unnest"
2962
+ exploded_df = pd.melt(
2963
+ df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
2964
+ )
2965
+ exploded_dfs.append(exploded_df)
2966
+ for old_key in key_names:
2967
+ del search_keys[old_key]
2968
+ search_keys[new_search_key] = key_type
2969
+ unnest_search_keys.append(new_search_key)
2970
+
2971
+ df = pd.concat(exploded_dfs, ignore_index=True)
2972
+ return df, unnest_search_keys
2973
+
2924
2974
  def __add_fit_system_record_id(
2925
- self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
2975
+ self,
2976
+ df: pd.DataFrame,
2977
+ meaning_types: Dict[str, FileColumnMeaningType],
2978
+ search_keys: Dict[str, SearchKey],
2979
+ id_name: str,
2926
2980
  ) -> pd.DataFrame:
2927
2981
  # save original order or rows
2928
2982
  original_index_name = df.index.name
@@ -2971,19 +3025,23 @@ class FeaturesEnricher(TransformerMixin):
2971
3025
 
2972
3026
  df = df.reset_index(drop=True).reset_index()
2973
3027
  # system_record_id saves correct order for fit
2974
- df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3028
+ df = df.rename(columns={DEFAULT_INDEX: id_name})
2975
3029
 
2976
3030
  # return original order
2977
3031
  df = df.set_index(ORIGINAL_INDEX)
2978
3032
  df.index.name = original_index_name
2979
3033
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
2980
3034
 
2981
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3035
+ meaning_types[id_name] = (
3036
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3037
+ if id_name == SYSTEM_RECORD_ID
3038
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3039
+ )
2982
3040
  return df
2983
3041
 
2984
3042
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
2985
3043
  target = df[self.TARGET_NAME]
2986
- if is_string_dtype(target) or is_object_dtype(target):
3044
+ if is_string_dtype(target):
2987
3045
  maybe_numeric_target = pd.to_numeric(target, errors="coerce")
2988
3046
  # If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
2989
3047
  if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
@@ -3033,7 +3091,10 @@ class FeaturesEnricher(TransformerMixin):
3033
3091
  )
3034
3092
 
3035
3093
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3036
- dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3094
+ dup_features = [
3095
+ c for c in comparing_columns
3096
+ if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3097
+ ]
3037
3098
  if len(dup_features) > 0:
3038
3099
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3039
3100
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3044,8 +3105,7 @@ class FeaturesEnricher(TransformerMixin):
3044
3105
  result_features = pd.merge(
3045
3106
  df_with_original_index,
3046
3107
  result_features,
3047
- left_on=SYSTEM_RECORD_ID,
3048
- right_on=SYSTEM_RECORD_ID,
3108
+ on=ENTITY_SYSTEM_RECORD_ID,
3049
3109
  how="left" if is_transform else "inner",
3050
3110
  )
3051
3111
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3383,8 +3443,7 @@ class FeaturesEnricher(TransformerMixin):
3383
3443
  valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
3384
3444
  else:
3385
3445
  if x[column_name].isnull().all() or (
3386
- (is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
3387
- and (x[column_name].astype("string").str.strip() == "").all()
3446
+ is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
3388
3447
  ):
3389
3448
  raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
3390
3449
 
@@ -3426,13 +3485,13 @@ class FeaturesEnricher(TransformerMixin):
3426
3485
  self.warning_counter.increment()
3427
3486
 
3428
3487
  if len(valid_search_keys) == 1:
3429
- for k, v in valid_search_keys.items():
3430
- # Show warning for country only if country is the only key
3431
- if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3432
- msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3433
- print(msg)
3434
- self.logger.warning(msg)
3435
- self.warning_counter.increment()
3488
+ key, value = list(valid_search_keys.items())[0]
3489
+ # Show warning for country only if country is the only key
3490
+ if x[key].nunique() == 1:
3491
+ msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3492
+ print(msg)
3493
+ self.logger.warning(msg)
3494
+ self.warning_counter.increment()
3436
3495
 
3437
3496
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3438
3497
 
@@ -3542,61 +3601,68 @@ class FeaturesEnricher(TransformerMixin):
3542
3601
  def check_need_detect(search_key: SearchKey):
3543
3602
  return not is_transform or search_key in self.fit_search_keys.values()
3544
3603
 
3545
- if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3546
- maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3547
- if maybe_key is not None:
3548
- search_keys[maybe_key] = SearchKey.POSTAL_CODE
3549
- self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3550
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3604
+ # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3605
+ if check_need_detect(SearchKey.POSTAL_CODE):
3606
+ maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3607
+ if maybe_keys:
3608
+ new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3609
+ search_keys.update(new_keys)
3610
+ self.autodetected_search_keys.update(new_keys)
3611
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3551
3612
  if not silent_mode:
3552
- print(self.bundle.get("postal_code_detected").format(maybe_key))
3613
+ print(self.bundle.get("postal_code_detected").format(maybe_keys))
3553
3614
 
3554
3615
  if (
3555
3616
  SearchKey.COUNTRY not in search_keys.values()
3556
3617
  and self.country_code is None
3557
3618
  and check_need_detect(SearchKey.COUNTRY)
3558
3619
  ):
3559
- maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3560
- if maybe_key is not None:
3561
- search_keys[maybe_key] = SearchKey.COUNTRY
3562
- self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3620
+ maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3621
+ if maybe_key:
3622
+ search_keys[maybe_key[0]] = SearchKey.COUNTRY
3623
+ self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3563
3624
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3564
3625
  if not silent_mode:
3565
3626
  print(self.bundle.get("country_detected").format(maybe_key))
3566
3627
 
3567
3628
  if (
3568
- SearchKey.EMAIL not in search_keys.values()
3569
- and SearchKey.HEM not in search_keys.values()
3629
+ # SearchKey.EMAIL not in search_keys.values()
3630
+ SearchKey.HEM not in search_keys.values()
3570
3631
  and check_need_detect(SearchKey.HEM)
3571
3632
  ):
3572
- maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3573
- if maybe_key is not None and maybe_key not in search_keys.keys():
3633
+ maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3634
+ if maybe_keys:
3574
3635
  if self.__is_registered or is_demo_dataset:
3575
- search_keys[maybe_key] = SearchKey.EMAIL
3576
- self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3577
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3636
+ new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3637
+ search_keys.update(new_keys)
3638
+ self.autodetected_search_keys.update(new_keys)
3639
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3578
3640
  if not silent_mode:
3579
- print(self.bundle.get("email_detected").format(maybe_key))
3641
+ print(self.bundle.get("email_detected").format(maybe_keys))
3580
3642
  else:
3581
3643
  self.logger.warning(
3582
- f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3644
+ f"Autodetected search key EMAIL in column {maybe_keys}."
3645
+ " But not used because not registered user"
3583
3646
  )
3584
3647
  if not silent_mode:
3585
- print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3648
+ print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3586
3649
  self.warning_counter.increment()
3587
3650
 
3588
- if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3589
- maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3590
- if maybe_key is not None and maybe_key not in search_keys.keys():
3651
+ # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3652
+ if check_need_detect(SearchKey.PHONE):
3653
+ maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3654
+ if maybe_keys:
3591
3655
  if self.__is_registered or is_demo_dataset:
3592
- search_keys[maybe_key] = SearchKey.PHONE
3593
- self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3594
- self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3656
+ new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3657
+ search_keys.update(new_keys)
3658
+ self.autodetected_search_keys.update(new_keys)
3659
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3595
3660
  if not silent_mode:
3596
- print(self.bundle.get("phone_detected").format(maybe_key))
3661
+ print(self.bundle.get("phone_detected").format(maybe_keys))
3597
3662
  else:
3598
3663
  self.logger.warning(
3599
- f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3664
+ f"Autodetected search key PHONE in column {maybe_keys}. "
3665
+ "But not used because not registered user"
3600
3666
  )
3601
3667
  if not silent_mode:
3602
3668
  print(self.bundle.get("phone_detected_not_registered"))