upgini 1.1.278a2__py3-none-any.whl → 1.1.279__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. upgini/__about__.py +1 -0
  2. upgini/ads_management/ads_manager.py +4 -2
  3. upgini/autofe/all_operands.py +3 -2
  4. upgini/autofe/binary.py +2 -1
  5. upgini/autofe/date.py +2 -1
  6. upgini/autofe/feature.py +1 -1
  7. upgini/autofe/groupby.py +3 -1
  8. upgini/autofe/operand.py +4 -3
  9. upgini/autofe/unary.py +2 -1
  10. upgini/autofe/vector.py +2 -0
  11. upgini/dataset.py +6 -15
  12. upgini/errors.py +1 -1
  13. upgini/features_enricher.py +104 -217
  14. upgini/http.py +11 -10
  15. upgini/mdc/__init__.py +1 -3
  16. upgini/mdc/context.py +4 -6
  17. upgini/metadata.py +5 -10
  18. upgini/metrics.py +102 -100
  19. upgini/normalizer/phone_normalizer.py +1 -1
  20. upgini/resource_bundle/__init__.py +5 -5
  21. upgini/resource_bundle/strings.properties +0 -1
  22. upgini/sampler/base.py +1 -4
  23. upgini/sampler/random_under_sampler.py +2 -5
  24. upgini/search_task.py +4 -4
  25. upgini/spinner.py +1 -1
  26. upgini/utils/__init__.py +1 -1
  27. upgini/utils/base_search_key_detector.py +14 -16
  28. upgini/utils/blocked_time_series.py +4 -2
  29. upgini/utils/country_utils.py +1 -1
  30. upgini/utils/custom_loss_utils.py +3 -2
  31. upgini/utils/cv_utils.py +2 -2
  32. upgini/utils/datetime_utils.py +20 -15
  33. upgini/utils/deduplicate_utils.py +1 -11
  34. upgini/utils/email_utils.py +2 -7
  35. upgini/utils/fallback_progress_bar.py +1 -1
  36. upgini/utils/progress_bar.py +1 -1
  37. upgini/utils/sklearn_ext.py +14 -13
  38. upgini/utils/track_info.py +2 -2
  39. upgini/version_validator.py +2 -2
  40. {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info}/METADATA +21 -23
  41. upgini-1.1.279.dist-info/RECORD +62 -0
  42. {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info}/WHEEL +1 -2
  43. upgini-1.1.278a2.dist-info/RECORD +0 -62
  44. upgini-1.1.278a2.dist-info/top_level.txt +0 -1
  45. {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info/licenses}/LICENSE +0 -0
@@ -11,7 +11,6 @@ import sys
11
11
  import tempfile
12
12
  import time
13
13
  import uuid
14
- from collections import Counter
15
14
  from dataclasses import dataclass
16
15
  from threading import Thread
17
16
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -46,11 +45,9 @@ from upgini.mdc import MDC
46
45
  from upgini.metadata import (
47
46
  COUNTRY,
48
47
  DEFAULT_INDEX,
49
- ENTITY_SYSTEM_RECORD_ID,
50
48
  EVAL_SET_INDEX,
51
49
  ORIGINAL_INDEX,
52
50
  RENAMED_INDEX,
53
- SEARCH_KEY_UNNEST,
54
51
  SORT_ID,
55
52
  SYSTEM_RECORD_ID,
56
53
  TARGET,
@@ -251,7 +248,7 @@ class FeaturesEnricher(TransformerMixin):
251
248
  self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
252
249
 
253
250
  validate_version(self.logger)
254
- self.search_keys = search_keys or {}
251
+ self.search_keys = search_keys or dict()
255
252
  self.country_code = country_code
256
253
  self.__validate_search_keys(search_keys, search_id)
257
254
  self.model_task_type = model_task_type
@@ -1191,7 +1188,7 @@ class FeaturesEnricher(TransformerMixin):
1191
1188
  email_column = self._get_email_column(search_keys)
1192
1189
  hem_column = self._get_hem_column(search_keys)
1193
1190
  if email_column:
1194
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1191
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1195
1192
  extended_X = converter.convert(extended_X)
1196
1193
  generated_features.extend(converter.generated_features)
1197
1194
  if (
@@ -1343,7 +1340,7 @@ class FeaturesEnricher(TransformerMixin):
1343
1340
  not in (
1344
1341
  excluding_search_keys
1345
1342
  + list(self.fit_dropped_features)
1346
- + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1343
+ + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1347
1344
  )
1348
1345
  ]
1349
1346
 
@@ -1407,7 +1404,7 @@ class FeaturesEnricher(TransformerMixin):
1407
1404
  fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
1408
1405
  )
1409
1406
 
1410
- fitting_eval_set_dict = {}
1407
+ fitting_eval_set_dict = dict()
1411
1408
  for idx, eval_tuple in eval_set_sampled_dict.items():
1412
1409
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1413
1410
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1519,7 +1516,7 @@ class FeaturesEnricher(TransformerMixin):
1519
1516
  def __sample_only_input(
1520
1517
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1521
1518
  ) -> _SampledDataForMetrics:
1522
- eval_set_sampled_dict = {}
1519
+ eval_set_sampled_dict = dict()
1523
1520
 
1524
1521
  df = validated_X.copy()
1525
1522
  df[TARGET] = validated_y
@@ -1545,7 +1542,7 @@ class FeaturesEnricher(TransformerMixin):
1545
1542
  df = df.sample(n=sample_rows, random_state=self.random_state)
1546
1543
 
1547
1544
  df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1548
- df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys)
1545
+ df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
1549
1546
 
1550
1547
  train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1551
1548
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -1569,7 +1566,7 @@ class FeaturesEnricher(TransformerMixin):
1569
1566
  trace_id: str,
1570
1567
  remove_outliers_calc_metrics: Optional[bool],
1571
1568
  ) -> _SampledDataForMetrics:
1572
- eval_set_sampled_dict = {}
1569
+ eval_set_sampled_dict = dict()
1573
1570
  search_keys = self.fit_search_keys
1574
1571
 
1575
1572
  rows_to_drop = None
@@ -1643,7 +1640,7 @@ class FeaturesEnricher(TransformerMixin):
1643
1640
  progress_bar: Optional[ProgressBar],
1644
1641
  progress_callback: Optional[Callable[[SearchProgress], Any]],
1645
1642
  ) -> _SampledDataForMetrics:
1646
- eval_set_sampled_dict = {}
1643
+ eval_set_sampled_dict = dict()
1647
1644
  if eval_set is not None:
1648
1645
  self.logger.info("Transform with eval_set")
1649
1646
  # concatenate X and eval_set with eval_set_index
@@ -1665,7 +1662,7 @@ class FeaturesEnricher(TransformerMixin):
1665
1662
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1666
1663
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1667
1664
 
1668
- eval_set_sampled_dict = {}
1665
+ eval_set_sampled_dict = dict()
1669
1666
 
1670
1667
  tmp_target_name = "__target"
1671
1668
  df = df.rename(columns={TARGET: tmp_target_name})
@@ -1928,38 +1925,11 @@ class FeaturesEnricher(TransformerMixin):
1928
1925
  self.logger.info("Input dataset hasn't date column")
1929
1926
  if self.add_date_if_missing:
1930
1927
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1931
-
1932
- # Don't pass all features in backend on transform
1933
- original_features_for_transform = []
1934
- runtime_parameters = self._get_copy_of_runtime_parameters()
1935
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1936
- if len(features_not_to_pass) > 0:
1937
- # Pass only features that need for transform
1938
- features_for_transform = self._search_task.get_features_for_transform()
1939
- if features_for_transform is not None and len(features_for_transform) > 0:
1940
- file_metadata = self._search_task.get_file_metadata(trace_id)
1941
- original_features_for_transform = [
1942
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1943
- ]
1944
-
1945
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1946
-
1947
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1948
-
1949
- df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1950
- df[columns_for_system_record_id], index=False
1951
- ).astype("Float64")
1952
-
1953
- # Explode multiple search keys
1954
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1955
-
1956
1928
  email_column = self._get_email_column(search_keys)
1957
1929
  hem_column = self._get_hem_column(search_keys)
1958
1930
  email_converted_to_hem = False
1959
1931
  if email_column:
1960
- converter = EmailSearchKeyConverter(
1961
- email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
1962
- )
1932
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1963
1933
  df = converter.convert(df)
1964
1934
  generated_features.extend(converter.generated_features)
1965
1935
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1973,21 +1943,30 @@ class FeaturesEnricher(TransformerMixin):
1973
1943
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1974
1944
 
1975
1945
  meaning_types = {col: key.value for col, key in search_keys.items()}
1976
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1977
- for col in original_features_for_transform:
1978
- meaning_types[col] = FileColumnMeaningType.FEATURE
1979
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1946
+ non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1980
1947
 
1981
1948
  if email_converted_to_hem:
1982
- features_not_to_pass.append(email_column)
1949
+ non_keys_columns.append(email_column)
1950
+
1951
+ # Don't pass features in backend on transform
1952
+ original_features_for_transform = None
1953
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1954
+ if len(non_keys_columns) > 0:
1955
+ # Pass only features that need for transform
1956
+ features_for_transform = self._search_task.get_features_for_transform()
1957
+ if features_for_transform is not None and len(features_for_transform) > 0:
1958
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1959
+ original_features_for_transform = [
1960
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1961
+ ]
1962
+ non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1983
1963
 
1984
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1985
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1964
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1986
1965
 
1987
1966
  if add_fit_system_record_id:
1988
- df = self.__add_fit_system_record_id(df, {}, search_keys)
1967
+ df = self.__add_fit_system_record_id(df, dict(), search_keys)
1989
1968
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1990
- features_not_to_pass.append(SORT_ID)
1969
+ non_keys_columns.append(SORT_ID)
1991
1970
 
1992
1971
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1993
1972
 
@@ -1995,19 +1974,16 @@ class FeaturesEnricher(TransformerMixin):
1995
1974
  "Float64"
1996
1975
  )
1997
1976
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1998
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
1999
- if SEARCH_KEY_UNNEST in df.columns:
2000
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2001
1977
 
2002
1978
  df = df.reset_index(drop=True)
2003
- system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1979
+ system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
2004
1980
  if add_fit_system_record_id:
2005
1981
  system_columns_with_original_index.append(SORT_ID)
2006
1982
  df_with_original_index = df[system_columns_with_original_index].copy()
2007
1983
 
2008
1984
  combined_search_keys = combine_search_keys(search_keys.keys())
2009
1985
 
2010
- df_without_features = df.drop(columns=features_not_to_pass)
1986
+ df_without_features = df.drop(columns=non_keys_columns)
2011
1987
 
2012
1988
  df_without_features = clean_full_duplicates(
2013
1989
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -2019,13 +1995,12 @@ class FeaturesEnricher(TransformerMixin):
2019
1995
  dataset = Dataset(
2020
1996
  "sample_" + str(uuid.uuid4()),
2021
1997
  df=df_without_features,
2022
- meaning_types=meaning_types,
2023
- search_keys=combined_search_keys,
2024
- unnest_search_keys=unnest_search_keys,
2025
1998
  date_format=self.date_format,
2026
1999
  rest_client=self.rest_client,
2027
2000
  logger=self.logger,
2028
2001
  )
2002
+ dataset.meaning_types = meaning_types
2003
+ dataset.search_keys = combined_search_keys
2029
2004
  if email_converted_to_hem:
2030
2005
  dataset.ignore_columns = [email_column]
2031
2006
 
@@ -2164,14 +2139,6 @@ class FeaturesEnricher(TransformerMixin):
2164
2139
 
2165
2140
  key_types = search_keys.values()
2166
2141
 
2167
- # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2168
- multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2169
- for multi_key in multi_keys:
2170
- if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2171
- msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2172
- self.logger.warning(msg)
2173
- raise ValidationError(msg)
2174
-
2175
2142
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2176
2143
  msg = self.bundle.get("date_and_datetime_simultanious")
2177
2144
  self.logger.warning(msg)
@@ -2187,11 +2154,11 @@ class FeaturesEnricher(TransformerMixin):
2187
2154
  self.logger.warning(msg)
2188
2155
  raise ValidationError(msg)
2189
2156
 
2190
- # for key_type in SearchKey.__members__.values():
2191
- # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2192
- # msg = self.bundle.get("multiple_search_key").format(key_type)
2193
- # self.logger.warning(msg)
2194
- # raise ValidationError(msg)
2157
+ for key_type in SearchKey.__members__.values():
2158
+ if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2159
+ msg = self.bundle.get("multiple_search_key").format(key_type)
2160
+ self.logger.warning(msg)
2161
+ raise ValidationError(msg)
2195
2162
 
2196
2163
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2197
2164
  # if (
@@ -2329,6 +2296,14 @@ class FeaturesEnricher(TransformerMixin):
2329
2296
  self.logger.info("Input dataset hasn't date column")
2330
2297
  if self.add_date_if_missing:
2331
2298
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2299
+ email_column = self._get_email_column(self.fit_search_keys)
2300
+ hem_column = self._get_hem_column(self.fit_search_keys)
2301
+ email_converted_to_hem = False
2302
+ if email_column:
2303
+ converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2304
+ df = converter.convert(df)
2305
+ self.fit_generated_features.extend(converter.generated_features)
2306
+ email_converted_to_hem = converter.email_converted_to_hem
2332
2307
  if (
2333
2308
  self.detect_missing_search_keys
2334
2309
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2337,37 +2312,7 @@ class FeaturesEnricher(TransformerMixin):
2337
2312
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2338
2313
  df = converter.convert(df)
2339
2314
 
2340
- # Explode multiple search keys
2341
2315
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2342
- meaning_types = {
2343
- **{col: key.value for col, key in self.fit_search_keys.items()},
2344
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2345
- }
2346
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2347
- if eval_set is not None and len(eval_set) > 0:
2348
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2349
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2350
-
2351
- # TODO check that this is correct for enrichment
2352
- self.df_with_original_index = df.copy()
2353
-
2354
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2355
-
2356
- # Convert EMAIL to HEM after unnesting to do it only with one column
2357
- email_column = self._get_email_column(self.fit_search_keys)
2358
- hem_column = self._get_hem_column(self.fit_search_keys)
2359
- email_converted_to_hem = False
2360
- if email_column:
2361
- converter = EmailSearchKeyConverter(
2362
- email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2363
- )
2364
- df = converter.convert(df)
2365
- self.fit_generated_features.extend(converter.generated_features)
2366
- email_converted_to_hem = converter.email_converted_to_hem
2367
-
2368
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2369
- self.fit_search_keys.keys()
2370
- )
2371
2316
  if email_converted_to_hem:
2372
2317
  non_feature_columns.append(email_column)
2373
2318
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2391,14 +2336,12 @@ class FeaturesEnricher(TransformerMixin):
2391
2336
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2392
2337
  }
2393
2338
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2394
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2395
- if SEARCH_KEY_UNNEST in df.columns:
2396
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2397
2339
  if eval_set is not None and len(eval_set) > 0:
2398
2340
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2399
2341
 
2400
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2342
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2401
2343
 
2344
+ self.df_with_original_index = df.copy()
2402
2345
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2403
2346
 
2404
2347
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2406,15 +2349,14 @@ class FeaturesEnricher(TransformerMixin):
2406
2349
  dataset = Dataset(
2407
2350
  "tds_" + str(uuid.uuid4()),
2408
2351
  df=df,
2409
- meaning_types=meaning_types,
2410
- search_keys=combined_search_keys,
2411
- unnest_search_keys=unnest_search_keys,
2412
2352
  model_task_type=model_task_type,
2413
2353
  date_format=self.date_format,
2414
2354
  random_state=self.random_state,
2415
2355
  rest_client=self.rest_client,
2416
2356
  logger=self.logger,
2417
2357
  )
2358
+ dataset.meaning_types = meaning_types
2359
+ dataset.search_keys = combined_search_keys
2418
2360
  if email_converted_to_hem:
2419
2361
  dataset.ignore_columns = [email_column]
2420
2362
 
@@ -2606,7 +2548,7 @@ class FeaturesEnricher(TransformerMixin):
2606
2548
  validated_X = X.copy()
2607
2549
  elif isinstance(X, pd.Series):
2608
2550
  validated_X = X.to_frame()
2609
- elif isinstance(X, np.ndarray) or isinstance(X, list):
2551
+ elif isinstance(X, (list, np.ndarray)):
2610
2552
  validated_X = pd.DataFrame(X)
2611
2553
  renaming = {c: str(c) for c in validated_X.columns}
2612
2554
  validated_X = validated_X.rename(columns=renaming)
@@ -2695,7 +2637,7 @@ class FeaturesEnricher(TransformerMixin):
2695
2637
  validated_eval_X = eval_X.copy()
2696
2638
  elif isinstance(eval_X, pd.Series):
2697
2639
  validated_eval_X = eval_X.to_frame()
2698
- elif isinstance(eval_X, np.ndarray) or isinstance(eval_X, list):
2640
+ elif isinstance(eval_X, (list, np.ndarray)):
2699
2641
  validated_eval_X = pd.DataFrame(eval_X)
2700
2642
  renaming = {c: str(c) for c in validated_eval_X.columns}
2701
2643
  validated_eval_X = validated_eval_X.rename(columns=renaming)
@@ -2784,10 +2726,9 @@ class FeaturesEnricher(TransformerMixin):
2784
2726
  X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
2785
2727
  ) -> Tuple[pd.DataFrame, pd.Series]:
2786
2728
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
2787
- record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
2788
2729
  Xy = X.copy()
2789
2730
  Xy[TARGET] = y
2790
- Xy = Xy.sort_values(by=record_id_column).reset_index(drop=True)
2731
+ Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2791
2732
  X = Xy.drop(columns=TARGET)
2792
2733
  y = Xy[TARGET].copy()
2793
2734
 
@@ -2878,7 +2819,7 @@ class FeaturesEnricher(TransformerMixin):
2878
2819
  )
2879
2820
 
2880
2821
  def sample(df):
2881
- if isinstance(df, pd.Series) or isinstance(df, pd.DataFrame):
2822
+ if isinstance(df, (pd.DataFrame, pd.Series)):
2882
2823
  return df.head(10)
2883
2824
  else:
2884
2825
  return df[:10]
@@ -2964,19 +2905,15 @@ class FeaturesEnricher(TransformerMixin):
2964
2905
 
2965
2906
  @staticmethod
2966
2907
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2967
- cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2968
- if len(cols) > 1:
2969
- raise Exception("More than one email column found after unnest")
2970
- if len(cols) == 1:
2971
- return cols[0]
2908
+ for col, t in search_keys.items():
2909
+ if t == SearchKey.EMAIL:
2910
+ return col
2972
2911
 
2973
2912
  @staticmethod
2974
2913
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2975
- cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2976
- if len(cols) > 1:
2977
- raise Exception("More than one hem column found after unnest")
2978
- if len(cols) == 1:
2979
- return cols[0]
2914
+ for col, t in search_keys.items():
2915
+ if t == SearchKey.HEM:
2916
+ return col
2980
2917
 
2981
2918
  @staticmethod
2982
2919
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -2984,44 +2921,8 @@ class FeaturesEnricher(TransformerMixin):
2984
2921
  if t == SearchKey.PHONE:
2985
2922
  return col
2986
2923
 
2987
- def _explode_multiple_search_keys(
2988
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
2989
- ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
2990
- # find groups of multiple search keys
2991
- search_key_names_by_type: Dict[SearchKey, str] = {}
2992
- for key_name, key_type in search_keys.items():
2993
- search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
2994
- search_key_names_by_type = {
2995
- key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
2996
- }
2997
- if len(search_key_names_by_type) == 0:
2998
- return df, {}
2999
-
3000
- multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3001
- other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3002
- exploded_dfs = []
3003
- unnest_search_keys = {}
3004
-
3005
- for key_type, key_names in search_key_names_by_type.items():
3006
- new_search_key = f"upgini_{key_type.name.lower()}_unnest"
3007
- exploded_df = pd.melt(
3008
- df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
3009
- )
3010
- exploded_dfs.append(exploded_df)
3011
- for old_key in key_names:
3012
- del search_keys[old_key]
3013
- search_keys[new_search_key] = key_type
3014
- unnest_search_keys[new_search_key] = key_names
3015
-
3016
- df = pd.concat(exploded_dfs, ignore_index=True)
3017
- return df, unnest_search_keys
3018
-
3019
2924
  def __add_fit_system_record_id(
3020
- self,
3021
- df: pd.DataFrame,
3022
- meaning_types: Dict[str, FileColumnMeaningType],
3023
- search_keys: Dict[str, SearchKey],
3024
- id_name: str,
2925
+ self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
3025
2926
  ) -> pd.DataFrame:
3026
2927
  # save original order or rows
3027
2928
  original_index_name = df.index.name
@@ -3070,18 +2971,14 @@ class FeaturesEnricher(TransformerMixin):
3070
2971
 
3071
2972
  df = df.reset_index(drop=True).reset_index()
3072
2973
  # system_record_id saves correct order for fit
3073
- df = df.rename(columns={DEFAULT_INDEX: id_name})
2974
+ df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3074
2975
 
3075
2976
  # return original order
3076
2977
  df = df.set_index(ORIGINAL_INDEX)
3077
2978
  df.index.name = original_index_name
3078
2979
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3079
2980
 
3080
- meaning_types[id_name] = (
3081
- FileColumnMeaningType.SYSTEM_RECORD_ID
3082
- if id_name == SYSTEM_RECORD_ID
3083
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3084
- )
2981
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3085
2982
  return df
3086
2983
 
3087
2984
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3136,11 +3033,7 @@ class FeaturesEnricher(TransformerMixin):
3136
3033
  )
3137
3034
 
3138
3035
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3139
- dup_features = [
3140
- c
3141
- for c in comparing_columns
3142
- if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3143
- ]
3036
+ dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3144
3037
  if len(dup_features) > 0:
3145
3038
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3146
3039
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3151,7 +3044,8 @@ class FeaturesEnricher(TransformerMixin):
3151
3044
  result_features = pd.merge(
3152
3045
  df_with_original_index,
3153
3046
  result_features,
3154
- on=ENTITY_SYSTEM_RECORD_ID,
3047
+ left_on=SYSTEM_RECORD_ID,
3048
+ right_on=SYSTEM_RECORD_ID,
3155
3049
  how="left" if is_transform else "inner",
3156
3050
  )
3157
3051
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3162,7 +3056,7 @@ class FeaturesEnricher(TransformerMixin):
3162
3056
  result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
3163
3057
  self.logger.info(f"After dropping target outliers size: {len(result_features)}")
3164
3058
 
3165
- result_eval_sets = {}
3059
+ result_eval_sets = dict()
3166
3060
  if not is_transform and EVAL_SET_INDEX in result_features.columns:
3167
3061
  result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
3168
3062
  eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
@@ -3368,7 +3262,7 @@ class FeaturesEnricher(TransformerMixin):
3368
3262
  if autofe_feature.op.is_vector:
3369
3263
  continue
3370
3264
 
3371
- description = {}
3265
+ description = dict()
3372
3266
 
3373
3267
  feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
3374
3268
  if feature_meta is None:
@@ -3534,13 +3428,13 @@ class FeaturesEnricher(TransformerMixin):
3534
3428
  self.warning_counter.increment()
3535
3429
 
3536
3430
  if len(valid_search_keys) == 1:
3537
- key, value = list(valid_search_keys.items())[0]
3538
- # Show warning for country only if country is the only key
3539
- if x[key].nunique() == 1:
3540
- msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3541
- print(msg)
3542
- self.logger.warning(msg)
3543
- self.warning_counter.increment()
3431
+ for k, v in valid_search_keys.items():
3432
+ # Show warning for country only if country is the only key
3433
+ if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3434
+ msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3435
+ print(msg)
3436
+ self.logger.warning(msg)
3437
+ self.warning_counter.increment()
3544
3438
 
3545
3439
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3546
3440
 
@@ -3650,68 +3544,61 @@ class FeaturesEnricher(TransformerMixin):
3650
3544
  def check_need_detect(search_key: SearchKey):
3651
3545
  return not is_transform or search_key in self.fit_search_keys.values()
3652
3546
 
3653
- # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3654
- if check_need_detect(SearchKey.POSTAL_CODE):
3655
- maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3656
- if maybe_keys:
3657
- new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3658
- search_keys.update(new_keys)
3659
- self.autodetected_search_keys.update(new_keys)
3660
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3547
+ if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3548
+ maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3549
+ if maybe_key is not None:
3550
+ search_keys[maybe_key] = SearchKey.POSTAL_CODE
3551
+ self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3552
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3661
3553
  if not silent_mode:
3662
- print(self.bundle.get("postal_code_detected").format(maybe_keys))
3554
+ print(self.bundle.get("postal_code_detected").format(maybe_key))
3663
3555
 
3664
3556
  if (
3665
3557
  SearchKey.COUNTRY not in search_keys.values()
3666
3558
  and self.country_code is None
3667
3559
  and check_need_detect(SearchKey.COUNTRY)
3668
3560
  ):
3669
- maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3670
- if maybe_key:
3671
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
3672
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3561
+ maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3562
+ if maybe_key is not None:
3563
+ search_keys[maybe_key] = SearchKey.COUNTRY
3564
+ self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3673
3565
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3674
3566
  if not silent_mode:
3675
3567
  print(self.bundle.get("country_detected").format(maybe_key))
3676
3568
 
3677
3569
  if (
3678
- # SearchKey.EMAIL not in search_keys.values()
3679
- SearchKey.HEM not in search_keys.values()
3570
+ SearchKey.EMAIL not in search_keys.values()
3571
+ and SearchKey.HEM not in search_keys.values()
3680
3572
  and check_need_detect(SearchKey.HEM)
3681
3573
  ):
3682
- maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3683
- if maybe_keys:
3574
+ maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3575
+ if maybe_key is not None and maybe_key not in search_keys.keys():
3684
3576
  if self.__is_registered or is_demo_dataset:
3685
- new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3686
- search_keys.update(new_keys)
3687
- self.autodetected_search_keys.update(new_keys)
3688
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3577
+ search_keys[maybe_key] = SearchKey.EMAIL
3578
+ self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3579
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3689
3580
  if not silent_mode:
3690
- print(self.bundle.get("email_detected").format(maybe_keys))
3581
+ print(self.bundle.get("email_detected").format(maybe_key))
3691
3582
  else:
3692
3583
  self.logger.warning(
3693
- f"Autodetected search key EMAIL in column {maybe_keys}."
3694
- " But not used because not registered user"
3584
+ f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3695
3585
  )
3696
3586
  if not silent_mode:
3697
- print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3587
+ print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3698
3588
  self.warning_counter.increment()
3699
3589
 
3700
- # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3701
- if check_need_detect(SearchKey.PHONE):
3702
- maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3703
- if maybe_keys:
3590
+ if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3591
+ maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3592
+ if maybe_key is not None and maybe_key not in search_keys.keys():
3704
3593
  if self.__is_registered or is_demo_dataset:
3705
- new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3706
- search_keys.update(new_keys)
3707
- self.autodetected_search_keys.update(new_keys)
3708
- self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3594
+ search_keys[maybe_key] = SearchKey.PHONE
3595
+ self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3596
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3709
3597
  if not silent_mode:
3710
- print(self.bundle.get("phone_detected").format(maybe_keys))
3598
+ print(self.bundle.get("phone_detected").format(maybe_key))
3711
3599
  else:
3712
3600
  self.logger.warning(
3713
- f"Autodetected search key PHONE in column {maybe_keys}. "
3714
- "But not used because not registered user"
3601
+ f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3715
3602
  )
3716
3603
  if not silent_mode:
3717
3604
  print(self.bundle.get("phone_detected_not_registered"))
@@ -3806,7 +3693,7 @@ class FeaturesEnricher(TransformerMixin):
3806
3693
  def sample(inp, sample_index):
3807
3694
  if _num_samples(inp) <= 1000:
3808
3695
  return inp
3809
- if isinstance(inp, pd.DataFrame) or isinstance(inp, pd.Series):
3696
+ if isinstance(inp, (pd.DataFrame, pd.Series)):
3810
3697
  return inp.sample(n=1000, random_state=random_state)
3811
3698
  if isinstance(inp, np.ndarray):
3812
3699
  return inp[sample_index]