upgini 1.1.275a1__py3-none-any.whl → 1.1.276__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

@@ -1,4 +1,5 @@
1
1
  import dataclasses
2
+ import datetime
2
3
  import gc
3
4
  import hashlib
4
5
  import itertools
@@ -10,7 +11,6 @@ import sys
10
11
  import tempfile
11
12
  import time
12
13
  import uuid
13
- from collections import Counter
14
14
  from dataclasses import dataclass
15
15
  from threading import Thread
16
16
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -21,6 +21,7 @@ from pandas.api.types import (
21
21
  is_bool,
22
22
  is_datetime64_any_dtype,
23
23
  is_numeric_dtype,
24
+ is_object_dtype,
24
25
  is_period_dtype,
25
26
  is_string_dtype,
26
27
  )
@@ -44,11 +45,9 @@ from upgini.mdc import MDC
44
45
  from upgini.metadata import (
45
46
  COUNTRY,
46
47
  DEFAULT_INDEX,
47
- ENTITY_SYSTEM_RECORD_ID,
48
48
  EVAL_SET_INDEX,
49
49
  ORIGINAL_INDEX,
50
50
  RENAMED_INDEX,
51
- SEARCH_KEY_UNNEST,
52
51
  SORT_ID,
53
52
  SYSTEM_RECORD_ID,
54
53
  TARGET,
@@ -149,6 +148,7 @@ class FeaturesEnricher(TransformerMixin):
149
148
  """
150
149
 
151
150
  TARGET_NAME = "target"
151
+ CURRENT_DATE = "current_date"
152
152
  RANDOM_STATE = 42
153
153
  CALCULATE_METRICS_THRESHOLD = 50_000_000
154
154
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -210,6 +210,7 @@ class FeaturesEnricher(TransformerMixin):
210
210
  client_ip: Optional[str] = None,
211
211
  client_visitorid: Optional[str] = None,
212
212
  custom_bundle_config: Optional[str] = None,
213
+ add_date_if_missing: bool = True,
213
214
  **kwargs,
214
215
  ):
215
216
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -320,6 +321,7 @@ class FeaturesEnricher(TransformerMixin):
320
321
  self.raise_validation_error = raise_validation_error
321
322
  self.exclude_columns = exclude_columns
322
323
  self.baseline_score_column = baseline_score_column
324
+ self.add_date_if_missing = add_date_if_missing
323
325
 
324
326
  def _get_api_key(self):
325
327
  return self._api_key
@@ -423,6 +425,9 @@ class FeaturesEnricher(TransformerMixin):
423
425
 
424
426
  self.__validate_search_keys(self.search_keys, self.search_id)
425
427
 
428
+ # Validate client estimator params
429
+ self._get_client_cat_features(estimator, X, self.search_keys)
430
+
426
431
  try:
427
432
  self.X = X
428
433
  self.y = y
@@ -816,6 +821,7 @@ class FeaturesEnricher(TransformerMixin):
816
821
  trace_id = trace_id or str(uuid.uuid4())
817
822
  start_time = time.time()
818
823
  with MDC(trace_id=trace_id):
824
+ self.logger.info("Start calculate metrics")
819
825
  if len(args) > 0:
820
826
  msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
821
827
  self.logger.warning(msg)
@@ -867,22 +873,9 @@ class FeaturesEnricher(TransformerMixin):
867
873
  self.__display_support_link(msg)
868
874
  return None
869
875
 
870
- cat_features = None
871
- search_keys_for_metrics = []
872
- if (
873
- estimator is not None
874
- and hasattr(estimator, "get_param")
875
- and estimator.get_param("cat_features") is not None
876
- ):
877
- cat_features = estimator.get_param("cat_features")
878
- if len(cat_features) > 0 and isinstance(cat_features[0], int):
879
- cat_features = [effective_X.columns[i] for i in cat_features]
880
- for cat_feature in cat_features:
881
- if cat_feature in self.search_keys:
882
- if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
883
- search_keys_for_metrics.append(cat_feature)
884
- else:
885
- raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
876
+ cat_features, search_keys_for_metrics = self._get_client_cat_features(
877
+ estimator, effective_X, self.search_keys
878
+ )
886
879
 
887
880
  prepared_data = self._prepare_data_for_metrics(
888
881
  trace_id=trace_id,
@@ -897,6 +890,7 @@ class FeaturesEnricher(TransformerMixin):
897
890
  search_keys_for_metrics=search_keys_for_metrics,
898
891
  progress_bar=progress_bar,
899
892
  progress_callback=progress_callback,
893
+ cat_features=cat_features,
900
894
  )
901
895
  if prepared_data is None:
902
896
  return None
@@ -1184,8 +1178,6 @@ class FeaturesEnricher(TransformerMixin):
1184
1178
  search_keys = self.search_keys.copy()
1185
1179
  search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
1186
1180
 
1187
- unnest_search_keys = []
1188
-
1189
1181
  extended_X = x.copy()
1190
1182
  generated_features = []
1191
1183
  date_column = self._get_date_column(search_keys)
@@ -1196,7 +1188,7 @@ class FeaturesEnricher(TransformerMixin):
1196
1188
  email_column = self._get_email_column(search_keys)
1197
1189
  hem_column = self._get_hem_column(search_keys)
1198
1190
  if email_column:
1199
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, unnest_search_keys, self.logger)
1191
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1200
1192
  extended_X = converter.convert(extended_X)
1201
1193
  generated_features.extend(converter.generated_features)
1202
1194
  if (
@@ -1274,6 +1266,29 @@ class FeaturesEnricher(TransformerMixin):
1274
1266
 
1275
1267
  return _cv, groups
1276
1268
 
1269
+ def _get_client_cat_features(
1270
+ self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1271
+ ) -> Optional[List[str]]:
1272
+ cat_features = None
1273
+ search_keys_for_metrics = []
1274
+ if (
1275
+ estimator is not None
1276
+ and hasattr(estimator, "get_param")
1277
+ and estimator.get_param("cat_features") is not None
1278
+ ):
1279
+ cat_features = estimator.get_param("cat_features")
1280
+ if len(cat_features) > 0:
1281
+ if all([isinstance(f, int) for f in cat_features]):
1282
+ cat_features = [X.columns[i] for i in cat_features]
1283
+ self.logger.info(f"Collected categorical features {cat_features} from user estimator")
1284
+ for cat_feature in cat_features:
1285
+ if cat_feature in search_keys:
1286
+ if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
1287
+ search_keys_for_metrics.append(cat_feature)
1288
+ else:
1289
+ raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
1290
+ return cat_features, search_keys_for_metrics
1291
+
1277
1292
  def _prepare_data_for_metrics(
1278
1293
  self,
1279
1294
  trace_id: str,
@@ -1288,6 +1303,7 @@ class FeaturesEnricher(TransformerMixin):
1288
1303
  search_keys_for_metrics: Optional[List[str]] = None,
1289
1304
  progress_bar: Optional[ProgressBar] = None,
1290
1305
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1306
+ cat_features: Optional[List[str]] = None,
1291
1307
  ):
1292
1308
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1293
1309
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
@@ -1345,9 +1361,8 @@ class FeaturesEnricher(TransformerMixin):
1345
1361
 
1346
1362
  # Detect and drop high cardinality columns in train
1347
1363
  columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
1348
- columns_with_high_cardinality = [
1349
- c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
1350
- ]
1364
+ non_excluding_columns = (self.generate_features or []) + (cat_features or [])
1365
+ columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
1351
1366
  if len(columns_with_high_cardinality) > 0:
1352
1367
  self.logger.warning(
1353
1368
  f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
@@ -1809,10 +1824,11 @@ class FeaturesEnricher(TransformerMixin):
1809
1824
  else:
1810
1825
  features_section = ""
1811
1826
 
1812
- api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
1827
+ search_id = self._search_task.search_task_id
1828
+ api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
1813
1829
  -H 'Authorization: {self.api_key}' \\
1814
1830
  -H 'Content-Type: application/json' \\
1815
- -d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
1831
+ -d '{{"search_keys": {keys}{features_section}}}'"""
1816
1832
  return api_example
1817
1833
 
1818
1834
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -1907,38 +1923,13 @@ class FeaturesEnricher(TransformerMixin):
1907
1923
  generated_features.extend(converter.generated_features)
1908
1924
  else:
1909
1925
  self.logger.info("Input dataset hasn't date column")
1910
-
1911
- # Don't pass all features in backend on transform
1912
- original_features_for_transform = []
1913
- runtime_parameters = self._get_copy_of_runtime_parameters()
1914
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1915
- if len(features_not_to_pass) > 0:
1916
- # Pass only features that need for transform
1917
- features_for_transform = self._search_task.get_features_for_transform()
1918
- if features_for_transform is not None and len(features_for_transform) > 0:
1919
- file_metadata = self._search_task.get_file_metadata(trace_id)
1920
- original_features_for_transform = [
1921
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1922
- ]
1923
-
1924
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1925
-
1926
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1927
-
1928
- df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1929
- df[columns_for_system_record_id], index=False
1930
- ).astype("Float64")
1931
-
1932
- # Explode multiple search keys
1933
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1934
-
1926
+ if self.add_date_if_missing:
1927
+ df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1935
1928
  email_column = self._get_email_column(search_keys)
1936
1929
  hem_column = self._get_hem_column(search_keys)
1937
1930
  email_converted_to_hem = False
1938
1931
  if email_column:
1939
- converter = EmailSearchKeyConverter(
1940
- email_column, hem_column, search_keys, unnest_search_keys, self.logger
1941
- )
1932
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1942
1933
  df = converter.convert(df)
1943
1934
  generated_features.extend(converter.generated_features)
1944
1935
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1952,21 +1943,30 @@ class FeaturesEnricher(TransformerMixin):
1952
1943
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1953
1944
 
1954
1945
  meaning_types = {col: key.value for col, key in search_keys.items()}
1955
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1956
- for col in original_features_for_transform:
1957
- meaning_types[col] = FileColumnMeaningType.FEATURE
1958
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1946
+ non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1959
1947
 
1960
1948
  if email_converted_to_hem:
1961
- features_not_to_pass.append(email_column)
1949
+ non_keys_columns.append(email_column)
1950
+
1951
+ # Don't pass features in backend on transform
1952
+ original_features_for_transform = None
1953
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1954
+ if len(non_keys_columns) > 0:
1955
+ # Pass only features that need for transform
1956
+ features_for_transform = self._search_task.get_features_for_transform()
1957
+ if features_for_transform is not None and len(features_for_transform) > 0:
1958
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1959
+ original_features_for_transform = [
1960
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1961
+ ]
1962
+ non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1962
1963
 
1963
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1964
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1964
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1965
1965
 
1966
1966
  if add_fit_system_record_id:
1967
1967
  df = self.__add_fit_system_record_id(df, dict(), search_keys)
1968
1968
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1969
- features_not_to_pass.append(SORT_ID)
1969
+ non_keys_columns.append(SORT_ID)
1970
1970
 
1971
1971
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1972
1972
 
@@ -1974,19 +1974,16 @@ class FeaturesEnricher(TransformerMixin):
1974
1974
  "Float64"
1975
1975
  )
1976
1976
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1977
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
1978
- if SEARCH_KEY_UNNEST in df.columns:
1979
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
1980
1977
 
1981
1978
  df = df.reset_index(drop=True)
1982
- system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1979
+ system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
1983
1980
  if add_fit_system_record_id:
1984
1981
  system_columns_with_original_index.append(SORT_ID)
1985
1982
  df_with_original_index = df[system_columns_with_original_index].copy()
1986
1983
 
1987
1984
  combined_search_keys = combine_search_keys(search_keys.keys())
1988
1985
 
1989
- df_without_features = df.drop(columns=features_not_to_pass)
1986
+ df_without_features = df.drop(columns=non_keys_columns)
1990
1987
 
1991
1988
  df_without_features = clean_full_duplicates(
1992
1989
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -2142,14 +2139,6 @@ class FeaturesEnricher(TransformerMixin):
2142
2139
 
2143
2140
  key_types = search_keys.values()
2144
2141
 
2145
- # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2146
- multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2147
- for multi_key in multi_keys:
2148
- if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2149
- msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2150
- self.logger.warning(msg)
2151
- raise ValidationError(msg)
2152
-
2153
2142
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2154
2143
  msg = self.bundle.get("date_and_datetime_simultanious")
2155
2144
  self.logger.warning(msg)
@@ -2165,11 +2154,11 @@ class FeaturesEnricher(TransformerMixin):
2165
2154
  self.logger.warning(msg)
2166
2155
  raise ValidationError(msg)
2167
2156
 
2168
- # for key_type in SearchKey.__members__.values():
2169
- # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2170
- # msg = self.bundle.get("multiple_search_key").format(key_type)
2171
- # self.logger.warning(msg)
2172
- # raise ValidationError(msg)
2157
+ for key_type in SearchKey.__members__.values():
2158
+ if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2159
+ msg = self.bundle.get("multiple_search_key").format(key_type)
2160
+ self.logger.warning(msg)
2161
+ raise ValidationError(msg)
2173
2162
 
2174
2163
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2175
2164
  # if (
@@ -2305,7 +2294,16 @@ class FeaturesEnricher(TransformerMixin):
2305
2294
  self.fit_generated_features.extend(converter.generated_features)
2306
2295
  else:
2307
2296
  self.logger.info("Input dataset hasn't date column")
2308
-
2297
+ if self.add_date_if_missing:
2298
+ df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2299
+ email_column = self._get_email_column(self.fit_search_keys)
2300
+ hem_column = self._get_hem_column(self.fit_search_keys)
2301
+ email_converted_to_hem = False
2302
+ if email_column:
2303
+ converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2304
+ df = converter.convert(df)
2305
+ self.fit_generated_features.extend(converter.generated_features)
2306
+ email_converted_to_hem = converter.email_converted_to_hem
2309
2307
  if (
2310
2308
  self.detect_missing_search_keys
2311
2309
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2314,37 +2312,7 @@ class FeaturesEnricher(TransformerMixin):
2314
2312
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2315
2313
  df = converter.convert(df)
2316
2314
 
2317
- # Explode multiple search keys
2318
2315
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2319
- meaning_types = {
2320
- **{col: key.value for col, key in self.fit_search_keys.items()},
2321
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2322
- }
2323
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2324
- if eval_set is not None and len(eval_set) > 0:
2325
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2326
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2327
-
2328
- # TODO check that this is correct for enrichment
2329
- self.df_with_original_index = df.copy()
2330
-
2331
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2332
-
2333
- # Convert EMAIL to HEM after unnesting to do it only with one column
2334
- email_column = self._get_email_column(self.fit_search_keys)
2335
- hem_column = self._get_hem_column(self.fit_search_keys)
2336
- email_converted_to_hem = False
2337
- if email_column:
2338
- converter = EmailSearchKeyConverter(
2339
- email_column, hem_column, self.fit_search_keys, unnest_search_keys, self.logger
2340
- )
2341
- df = converter.convert(df)
2342
- self.fit_generated_features.extend(converter.generated_features)
2343
- email_converted_to_hem = converter.email_converted_to_hem
2344
-
2345
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2346
- self.fit_search_keys.keys()
2347
- )
2348
2316
  if email_converted_to_hem:
2349
2317
  non_feature_columns.append(email_column)
2350
2318
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2368,14 +2336,12 @@ class FeaturesEnricher(TransformerMixin):
2368
2336
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2369
2337
  }
2370
2338
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2371
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2372
- if SEARCH_KEY_UNNEST in df.columns:
2373
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2374
2339
  if eval_set is not None and len(eval_set) > 0:
2375
2340
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2376
2341
 
2377
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2342
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2378
2343
 
2344
+ self.df_with_original_index = df.copy()
2379
2345
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2380
2346
 
2381
2347
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2383,15 +2349,14 @@ class FeaturesEnricher(TransformerMixin):
2383
2349
  dataset = Dataset(
2384
2350
  "tds_" + str(uuid.uuid4()),
2385
2351
  df=df,
2386
- meaning_types=meaning_types,
2387
- search_keys=combined_search_keys,
2388
- unnest_search_keys=unnest_search_keys,
2389
2352
  model_task_type=model_task_type,
2390
2353
  date_format=self.date_format,
2391
2354
  random_state=self.random_state,
2392
2355
  rest_client=self.rest_client,
2393
2356
  logger=self.logger,
2394
2357
  )
2358
+ dataset.meaning_types = meaning_types
2359
+ dataset.search_keys = combined_search_keys
2395
2360
  if email_converted_to_hem:
2396
2361
  dataset.ignore_columns = [email_column]
2397
2362
 
@@ -2911,6 +2876,25 @@ class FeaturesEnricher(TransformerMixin):
2911
2876
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2912
2877
  return col
2913
2878
 
2879
+ @staticmethod
2880
+ def _add_current_date_as_key(
2881
+ df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
2882
+ ) -> pd.DataFrame:
2883
+ if (
2884
+ set(search_keys.values()) == {SearchKey.PHONE}
2885
+ or set(search_keys.values()) == {SearchKey.EMAIL}
2886
+ or set(search_keys.values()) == {SearchKey.HEM}
2887
+ or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
2888
+ ):
2889
+ msg = bundle.get("current_date_added")
2890
+ print(msg)
2891
+ logger.warning(msg)
2892
+ df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2893
+ search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2894
+ converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
2895
+ df = converter.convert(df)
2896
+ return df
2897
+
2914
2898
  @staticmethod
2915
2899
  def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
2916
2900
  return [
@@ -2921,19 +2905,15 @@ class FeaturesEnricher(TransformerMixin):
2921
2905
 
2922
2906
  @staticmethod
2923
2907
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2924
- cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2925
- if len(cols) > 1:
2926
- raise Exception("More than one email column found after unnest")
2927
- if len(cols) == 1:
2928
- return cols[0]
2908
+ for col, t in search_keys.items():
2909
+ if t == SearchKey.EMAIL:
2910
+ return col
2929
2911
 
2930
2912
  @staticmethod
2931
2913
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2932
- cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2933
- if len(cols) > 1:
2934
- raise Exception("More than one hem column found after unnest")
2935
- if len(cols) == 1:
2936
- return cols[0]
2914
+ for col, t in search_keys.items():
2915
+ if t == SearchKey.HEM:
2916
+ return col
2937
2917
 
2938
2918
  @staticmethod
2939
2919
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -2941,42 +2921,8 @@ class FeaturesEnricher(TransformerMixin):
2941
2921
  if t == SearchKey.PHONE:
2942
2922
  return col
2943
2923
 
2944
- def _explode_multiple_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
2945
- # find groups of multiple search keys
2946
- search_key_names_by_type: Dict[SearchKey, str] = dict()
2947
- for key_name, key_type in search_keys.items():
2948
- search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
2949
- search_key_names_by_type = {
2950
- key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
2951
- }
2952
- if len(search_key_names_by_type) == 0:
2953
- return df, []
2954
-
2955
- multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
2956
- other_columns = [col for col in df.columns if col not in multiple_keys_columns]
2957
- exploded_dfs = []
2958
- unnest_search_keys = []
2959
-
2960
- for key_type, key_names in search_key_names_by_type.items():
2961
- new_search_key = f"upgini_{key_type.name.lower()}_unnest"
2962
- exploded_df = pd.melt(
2963
- df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
2964
- )
2965
- exploded_dfs.append(exploded_df)
2966
- for old_key in key_names:
2967
- del search_keys[old_key]
2968
- search_keys[new_search_key] = key_type
2969
- unnest_search_keys.append(new_search_key)
2970
-
2971
- df = pd.concat(exploded_dfs, ignore_index=True)
2972
- return df, unnest_search_keys
2973
-
2974
2924
  def __add_fit_system_record_id(
2975
- self,
2976
- df: pd.DataFrame,
2977
- meaning_types: Dict[str, FileColumnMeaningType],
2978
- search_keys: Dict[str, SearchKey],
2979
- id_name: str,
2925
+ self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
2980
2926
  ) -> pd.DataFrame:
2981
2927
  # save original order or rows
2982
2928
  original_index_name = df.index.name
@@ -3025,23 +2971,19 @@ class FeaturesEnricher(TransformerMixin):
3025
2971
 
3026
2972
  df = df.reset_index(drop=True).reset_index()
3027
2973
  # system_record_id saves correct order for fit
3028
- df = df.rename(columns={DEFAULT_INDEX: id_name})
2974
+ df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3029
2975
 
3030
2976
  # return original order
3031
2977
  df = df.set_index(ORIGINAL_INDEX)
3032
2978
  df.index.name = original_index_name
3033
2979
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3034
2980
 
3035
- meaning_types[id_name] = (
3036
- FileColumnMeaningType.SYSTEM_RECORD_ID
3037
- if id_name == SYSTEM_RECORD_ID
3038
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3039
- )
2981
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3040
2982
  return df
3041
2983
 
3042
2984
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
3043
2985
  target = df[self.TARGET_NAME]
3044
- if is_string_dtype(target):
2986
+ if is_string_dtype(target) or is_object_dtype(target):
3045
2987
  maybe_numeric_target = pd.to_numeric(target, errors="coerce")
3046
2988
  # If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
3047
2989
  if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
@@ -3091,10 +3033,7 @@ class FeaturesEnricher(TransformerMixin):
3091
3033
  )
3092
3034
 
3093
3035
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3094
- dup_features = [
3095
- c for c in comparing_columns
3096
- if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3097
- ]
3036
+ dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3098
3037
  if len(dup_features) > 0:
3099
3038
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3100
3039
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3105,7 +3044,8 @@ class FeaturesEnricher(TransformerMixin):
3105
3044
  result_features = pd.merge(
3106
3045
  df_with_original_index,
3107
3046
  result_features,
3108
- on=ENTITY_SYSTEM_RECORD_ID,
3047
+ left_on=SYSTEM_RECORD_ID,
3048
+ right_on=SYSTEM_RECORD_ID,
3109
3049
  how="left" if is_transform else "inner",
3110
3050
  )
3111
3051
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3316,6 +3256,8 @@ class FeaturesEnricher(TransformerMixin):
3316
3256
  descriptions = []
3317
3257
  for m in autofe_meta:
3318
3258
  autofe_feature = Feature.from_formula(m.formula)
3259
+ orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
3260
+ autofe_feature.rename_columns(orig_to_hashed)
3319
3261
  autofe_feature.set_display_index(m.display_index)
3320
3262
  if autofe_feature.op.is_vector:
3321
3263
  continue
@@ -3443,7 +3385,8 @@ class FeaturesEnricher(TransformerMixin):
3443
3385
  valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
3444
3386
  else:
3445
3387
  if x[column_name].isnull().all() or (
3446
- is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
3388
+ (is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
3389
+ and (x[column_name].astype("string").str.strip() == "").all()
3447
3390
  ):
3448
3391
  raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
3449
3392
 
@@ -3485,13 +3428,13 @@ class FeaturesEnricher(TransformerMixin):
3485
3428
  self.warning_counter.increment()
3486
3429
 
3487
3430
  if len(valid_search_keys) == 1:
3488
- key, value = list(valid_search_keys.items())[0]
3489
- # Show warning for country only if country is the only key
3490
- if x[key].nunique() == 1:
3491
- msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3492
- print(msg)
3493
- self.logger.warning(msg)
3494
- self.warning_counter.increment()
3431
+ for k, v in valid_search_keys.items():
3432
+ # Show warning for country only if country is the only key
3433
+ if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3434
+ msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3435
+ print(msg)
3436
+ self.logger.warning(msg)
3437
+ self.warning_counter.increment()
3495
3438
 
3496
3439
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3497
3440
 
@@ -3601,68 +3544,61 @@ class FeaturesEnricher(TransformerMixin):
3601
3544
  def check_need_detect(search_key: SearchKey):
3602
3545
  return not is_transform or search_key in self.fit_search_keys.values()
3603
3546
 
3604
- # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3605
- if check_need_detect(SearchKey.POSTAL_CODE):
3606
- maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3607
- if maybe_keys:
3608
- new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3609
- search_keys.update(new_keys)
3610
- self.autodetected_search_keys.update(new_keys)
3611
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3547
+ if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3548
+ maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3549
+ if maybe_key is not None:
3550
+ search_keys[maybe_key] = SearchKey.POSTAL_CODE
3551
+ self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3552
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3612
3553
  if not silent_mode:
3613
- print(self.bundle.get("postal_code_detected").format(maybe_keys))
3554
+ print(self.bundle.get("postal_code_detected").format(maybe_key))
3614
3555
 
3615
3556
  if (
3616
3557
  SearchKey.COUNTRY not in search_keys.values()
3617
3558
  and self.country_code is None
3618
3559
  and check_need_detect(SearchKey.COUNTRY)
3619
3560
  ):
3620
- maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3621
- if maybe_key:
3622
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
3623
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3561
+ maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3562
+ if maybe_key is not None:
3563
+ search_keys[maybe_key] = SearchKey.COUNTRY
3564
+ self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3624
3565
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3625
3566
  if not silent_mode:
3626
3567
  print(self.bundle.get("country_detected").format(maybe_key))
3627
3568
 
3628
3569
  if (
3629
- # SearchKey.EMAIL not in search_keys.values()
3630
- SearchKey.HEM not in search_keys.values()
3570
+ SearchKey.EMAIL not in search_keys.values()
3571
+ and SearchKey.HEM not in search_keys.values()
3631
3572
  and check_need_detect(SearchKey.HEM)
3632
3573
  ):
3633
- maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3634
- if maybe_keys:
3574
+ maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3575
+ if maybe_key is not None and maybe_key not in search_keys.keys():
3635
3576
  if self.__is_registered or is_demo_dataset:
3636
- new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3637
- search_keys.update(new_keys)
3638
- self.autodetected_search_keys.update(new_keys)
3639
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3577
+ search_keys[maybe_key] = SearchKey.EMAIL
3578
+ self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3579
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3640
3580
  if not silent_mode:
3641
- print(self.bundle.get("email_detected").format(maybe_keys))
3581
+ print(self.bundle.get("email_detected").format(maybe_key))
3642
3582
  else:
3643
3583
  self.logger.warning(
3644
- f"Autodetected search key EMAIL in column {maybe_keys}."
3645
- " But not used because not registered user"
3584
+ f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3646
3585
  )
3647
3586
  if not silent_mode:
3648
- print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3587
+ print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3649
3588
  self.warning_counter.increment()
3650
3589
 
3651
- # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3652
- if check_need_detect(SearchKey.PHONE):
3653
- maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3654
- if maybe_keys:
3590
+ if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3591
+ maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3592
+ if maybe_key is not None and maybe_key not in search_keys.keys():
3655
3593
  if self.__is_registered or is_demo_dataset:
3656
- new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3657
- search_keys.update(new_keys)
3658
- self.autodetected_search_keys.update(new_keys)
3659
- self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3594
+ search_keys[maybe_key] = SearchKey.PHONE
3595
+ self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3596
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3660
3597
  if not silent_mode:
3661
- print(self.bundle.get("phone_detected").format(maybe_keys))
3598
+ print(self.bundle.get("phone_detected").format(maybe_key))
3662
3599
  else:
3663
3600
  self.logger.warning(
3664
- f"Autodetected search key PHONE in column {maybe_keys}. "
3665
- "But not used because not registered user"
3601
+ f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3666
3602
  )
3667
3603
  if not silent_mode:
3668
3604
  print(self.bundle.get("phone_detected_not_registered"))