upgini 1.1.276__py3-none-any.whl → 1.1.278a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/dataset.py CHANGED
@@ -23,7 +23,9 @@ from pandas.api.types import (
23
23
  from upgini.errors import ValidationError
24
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
25
  from upgini.metadata import (
26
+ ENTITY_SYSTEM_RECORD_ID,
26
27
  EVAL_SET_INDEX,
28
+ SEARCH_KEY_UNNEST,
27
29
  SYSTEM_COLUMNS,
28
30
  SYSTEM_RECORD_ID,
29
31
  TARGET,
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
79
81
  path: Optional[str] = None,
80
82
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
81
83
  search_keys: Optional[List[Tuple[str, ...]]] = None,
84
+ unnest_search_keys: Optional[Dict[str, str]] = None,
82
85
  model_task_type: Optional[ModelTaskType] = None,
83
86
  random_state: Optional[int] = None,
84
87
  rest_client: Optional[_RestClient] = None,
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
113
116
  self.description = description
114
117
  self.meaning_types = meaning_types
115
118
  self.search_keys = search_keys
119
+ self.unnest_search_keys = unnest_search_keys
116
120
  self.ignore_columns = []
117
121
  self.hierarchical_group_keys = []
118
122
  self.hierarchical_subgroup_keys = []
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
172
176
  new_columns = []
173
177
  dup_counter = 0
174
178
  for column in self.data.columns:
175
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
179
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
176
180
  self.columns_renaming[column] = column
177
181
  new_columns.append(column)
178
182
  continue
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
353
357
 
354
358
  if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
355
359
  try:
356
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
360
+ self.data[postal_code] = (
361
+ self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
362
+ )
357
363
  except Exception:
358
364
  pass
359
365
  elif is_float_dtype(self.data[postal_code]):
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
803
809
  meaningType=meaning_type,
804
810
  minMaxValues=min_max_values,
805
811
  )
812
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
813
+ column_meta.isUnnest = True
814
+ column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
806
815
 
807
816
  columns.append(column_meta)
808
817
 
@@ -11,6 +11,7 @@ import sys
11
11
  import tempfile
12
12
  import time
13
13
  import uuid
14
+ from collections import Counter
14
15
  from dataclasses import dataclass
15
16
  from threading import Thread
16
17
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -45,9 +46,11 @@ from upgini.mdc import MDC
45
46
  from upgini.metadata import (
46
47
  COUNTRY,
47
48
  DEFAULT_INDEX,
49
+ ENTITY_SYSTEM_RECORD_ID,
48
50
  EVAL_SET_INDEX,
49
51
  ORIGINAL_INDEX,
50
52
  RENAMED_INDEX,
53
+ SEARCH_KEY_UNNEST,
51
54
  SORT_ID,
52
55
  SYSTEM_RECORD_ID,
53
56
  TARGET,
@@ -248,7 +251,7 @@ class FeaturesEnricher(TransformerMixin):
248
251
  self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
249
252
 
250
253
  validate_version(self.logger)
251
- self.search_keys = search_keys or dict()
254
+ self.search_keys = search_keys or {}
252
255
  self.country_code = country_code
253
256
  self.__validate_search_keys(search_keys, search_id)
254
257
  self.model_task_type = model_task_type
@@ -1188,7 +1191,7 @@ class FeaturesEnricher(TransformerMixin):
1188
1191
  email_column = self._get_email_column(search_keys)
1189
1192
  hem_column = self._get_hem_column(search_keys)
1190
1193
  if email_column:
1191
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1194
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1192
1195
  extended_X = converter.convert(extended_X)
1193
1196
  generated_features.extend(converter.generated_features)
1194
1197
  if (
@@ -1404,7 +1407,7 @@ class FeaturesEnricher(TransformerMixin):
1404
1407
  fitting_enriched_X[col].astype("string").str.replace(",", ".").astype(np.float64)
1405
1408
  )
1406
1409
 
1407
- fitting_eval_set_dict = dict()
1410
+ fitting_eval_set_dict = {}
1408
1411
  for idx, eval_tuple in eval_set_sampled_dict.items():
1409
1412
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1410
1413
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1516,7 +1519,7 @@ class FeaturesEnricher(TransformerMixin):
1516
1519
  def __sample_only_input(
1517
1520
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1518
1521
  ) -> _SampledDataForMetrics:
1519
- eval_set_sampled_dict = dict()
1522
+ eval_set_sampled_dict = {}
1520
1523
 
1521
1524
  df = validated_X.copy()
1522
1525
  df[TARGET] = validated_y
@@ -1542,7 +1545,7 @@ class FeaturesEnricher(TransformerMixin):
1542
1545
  df = df.sample(n=sample_rows, random_state=self.random_state)
1543
1546
 
1544
1547
  df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1545
- df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
1548
+ df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys)
1546
1549
 
1547
1550
  train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1548
1551
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -1566,7 +1569,7 @@ class FeaturesEnricher(TransformerMixin):
1566
1569
  trace_id: str,
1567
1570
  remove_outliers_calc_metrics: Optional[bool],
1568
1571
  ) -> _SampledDataForMetrics:
1569
- eval_set_sampled_dict = dict()
1572
+ eval_set_sampled_dict = {}
1570
1573
  search_keys = self.fit_search_keys
1571
1574
 
1572
1575
  rows_to_drop = None
@@ -1640,7 +1643,7 @@ class FeaturesEnricher(TransformerMixin):
1640
1643
  progress_bar: Optional[ProgressBar],
1641
1644
  progress_callback: Optional[Callable[[SearchProgress], Any]],
1642
1645
  ) -> _SampledDataForMetrics:
1643
- eval_set_sampled_dict = dict()
1646
+ eval_set_sampled_dict = {}
1644
1647
  if eval_set is not None:
1645
1648
  self.logger.info("Transform with eval_set")
1646
1649
  # concatenate X and eval_set with eval_set_index
@@ -1662,7 +1665,7 @@ class FeaturesEnricher(TransformerMixin):
1662
1665
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1663
1666
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1664
1667
 
1665
- eval_set_sampled_dict = dict()
1668
+ eval_set_sampled_dict = {}
1666
1669
 
1667
1670
  tmp_target_name = "__target"
1668
1671
  df = df.rename(columns={TARGET: tmp_target_name})
@@ -1925,11 +1928,38 @@ class FeaturesEnricher(TransformerMixin):
1925
1928
  self.logger.info("Input dataset hasn't date column")
1926
1929
  if self.add_date_if_missing:
1927
1930
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1931
+
1932
+ # Don't pass all features in backend on transform
1933
+ original_features_for_transform = []
1934
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1935
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1936
+ if len(features_not_to_pass) > 0:
1937
+ # Pass only features that need for transform
1938
+ features_for_transform = self._search_task.get_features_for_transform()
1939
+ if features_for_transform is not None and len(features_for_transform) > 0:
1940
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1941
+ original_features_for_transform = [
1942
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1943
+ ]
1944
+
1945
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1946
+
1947
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1948
+
1949
+ df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1950
+ df[columns_for_system_record_id], index=False
1951
+ ).astype("Float64")
1952
+
1953
+ # Explode multiple search keys
1954
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1955
+
1928
1956
  email_column = self._get_email_column(search_keys)
1929
1957
  hem_column = self._get_hem_column(search_keys)
1930
1958
  email_converted_to_hem = False
1931
1959
  if email_column:
1932
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1960
+ converter = EmailSearchKeyConverter(
1961
+ email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
1962
+ )
1933
1963
  df = converter.convert(df)
1934
1964
  generated_features.extend(converter.generated_features)
1935
1965
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1943,30 +1973,21 @@ class FeaturesEnricher(TransformerMixin):
1943
1973
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1944
1974
 
1945
1975
  meaning_types = {col: key.value for col, key in search_keys.items()}
1946
- non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1976
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1977
+ for col in original_features_for_transform:
1978
+ meaning_types[col] = FileColumnMeaningType.FEATURE
1979
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1947
1980
 
1948
1981
  if email_converted_to_hem:
1949
- non_keys_columns.append(email_column)
1950
-
1951
- # Don't pass features in backend on transform
1952
- original_features_for_transform = None
1953
- runtime_parameters = self._get_copy_of_runtime_parameters()
1954
- if len(non_keys_columns) > 0:
1955
- # Pass only features that need for transform
1956
- features_for_transform = self._search_task.get_features_for_transform()
1957
- if features_for_transform is not None and len(features_for_transform) > 0:
1958
- file_metadata = self._search_task.get_file_metadata(trace_id)
1959
- original_features_for_transform = [
1960
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1961
- ]
1962
- non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1982
+ features_not_to_pass.append(email_column)
1963
1983
 
1964
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1984
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
1985
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1965
1986
 
1966
1987
  if add_fit_system_record_id:
1967
- df = self.__add_fit_system_record_id(df, dict(), search_keys)
1988
+ df = self.__add_fit_system_record_id(df, {}, search_keys)
1968
1989
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1969
- non_keys_columns.append(SORT_ID)
1990
+ features_not_to_pass.append(SORT_ID)
1970
1991
 
1971
1992
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1972
1993
 
@@ -1974,16 +1995,19 @@ class FeaturesEnricher(TransformerMixin):
1974
1995
  "Float64"
1975
1996
  )
1976
1997
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
1998
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
1999
+ if SEARCH_KEY_UNNEST in df.columns:
2000
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
1977
2001
 
1978
2002
  df = df.reset_index(drop=True)
1979
- system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
2003
+ system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1980
2004
  if add_fit_system_record_id:
1981
2005
  system_columns_with_original_index.append(SORT_ID)
1982
2006
  df_with_original_index = df[system_columns_with_original_index].copy()
1983
2007
 
1984
2008
  combined_search_keys = combine_search_keys(search_keys.keys())
1985
2009
 
1986
- df_without_features = df.drop(columns=non_keys_columns)
2010
+ df_without_features = df.drop(columns=features_not_to_pass)
1987
2011
 
1988
2012
  df_without_features = clean_full_duplicates(
1989
2013
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -1995,12 +2019,13 @@ class FeaturesEnricher(TransformerMixin):
1995
2019
  dataset = Dataset(
1996
2020
  "sample_" + str(uuid.uuid4()),
1997
2021
  df=df_without_features,
2022
+ meaning_types=meaning_types,
2023
+ search_keys=combined_search_keys,
2024
+ unnest_search_keys=unnest_search_keys,
1998
2025
  date_format=self.date_format,
1999
2026
  rest_client=self.rest_client,
2000
2027
  logger=self.logger,
2001
2028
  )
2002
- dataset.meaning_types = meaning_types
2003
- dataset.search_keys = combined_search_keys
2004
2029
  if email_converted_to_hem:
2005
2030
  dataset.ignore_columns = [email_column]
2006
2031
 
@@ -2139,6 +2164,14 @@ class FeaturesEnricher(TransformerMixin):
2139
2164
 
2140
2165
  key_types = search_keys.values()
2141
2166
 
2167
+ # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2168
+ multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2169
+ for multi_key in multi_keys:
2170
+ if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2171
+ msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2172
+ self.logger.warning(msg)
2173
+ raise ValidationError(msg)
2174
+
2142
2175
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2143
2176
  msg = self.bundle.get("date_and_datetime_simultanious")
2144
2177
  self.logger.warning(msg)
@@ -2154,11 +2187,11 @@ class FeaturesEnricher(TransformerMixin):
2154
2187
  self.logger.warning(msg)
2155
2188
  raise ValidationError(msg)
2156
2189
 
2157
- for key_type in SearchKey.__members__.values():
2158
- if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2159
- msg = self.bundle.get("multiple_search_key").format(key_type)
2160
- self.logger.warning(msg)
2161
- raise ValidationError(msg)
2190
+ # for key_type in SearchKey.__members__.values():
2191
+ # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2192
+ # msg = self.bundle.get("multiple_search_key").format(key_type)
2193
+ # self.logger.warning(msg)
2194
+ # raise ValidationError(msg)
2162
2195
 
2163
2196
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2164
2197
  # if (
@@ -2296,14 +2329,6 @@ class FeaturesEnricher(TransformerMixin):
2296
2329
  self.logger.info("Input dataset hasn't date column")
2297
2330
  if self.add_date_if_missing:
2298
2331
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2299
- email_column = self._get_email_column(self.fit_search_keys)
2300
- hem_column = self._get_hem_column(self.fit_search_keys)
2301
- email_converted_to_hem = False
2302
- if email_column:
2303
- converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2304
- df = converter.convert(df)
2305
- self.fit_generated_features.extend(converter.generated_features)
2306
- email_converted_to_hem = converter.email_converted_to_hem
2307
2332
  if (
2308
2333
  self.detect_missing_search_keys
2309
2334
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2312,7 +2337,37 @@ class FeaturesEnricher(TransformerMixin):
2312
2337
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2313
2338
  df = converter.convert(df)
2314
2339
 
2340
+ # Explode multiple search keys
2315
2341
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2342
+ meaning_types = {
2343
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2344
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2345
+ }
2346
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2347
+ if eval_set is not None and len(eval_set) > 0:
2348
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2349
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2350
+
2351
+ # TODO check that this is correct for enrichment
2352
+ self.df_with_original_index = df.copy()
2353
+
2354
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2355
+
2356
+ # Convert EMAIL to HEM after unnesting to do it only with one column
2357
+ email_column = self._get_email_column(self.fit_search_keys)
2358
+ hem_column = self._get_hem_column(self.fit_search_keys)
2359
+ email_converted_to_hem = False
2360
+ if email_column:
2361
+ converter = EmailSearchKeyConverter(
2362
+ email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2363
+ )
2364
+ df = converter.convert(df)
2365
+ self.fit_generated_features.extend(converter.generated_features)
2366
+ email_converted_to_hem = converter.email_converted_to_hem
2367
+
2368
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2369
+ self.fit_search_keys.keys()
2370
+ )
2316
2371
  if email_converted_to_hem:
2317
2372
  non_feature_columns.append(email_column)
2318
2373
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2336,12 +2391,14 @@ class FeaturesEnricher(TransformerMixin):
2336
2391
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2337
2392
  }
2338
2393
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2394
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2395
+ if SEARCH_KEY_UNNEST in df.columns:
2396
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2339
2397
  if eval_set is not None and len(eval_set) > 0:
2340
2398
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2341
2399
 
2342
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2400
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2343
2401
 
2344
- self.df_with_original_index = df.copy()
2345
2402
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2346
2403
 
2347
2404
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2349,14 +2406,15 @@ class FeaturesEnricher(TransformerMixin):
2349
2406
  dataset = Dataset(
2350
2407
  "tds_" + str(uuid.uuid4()),
2351
2408
  df=df,
2409
+ meaning_types=meaning_types,
2410
+ search_keys=combined_search_keys,
2411
+ unnest_search_keys=unnest_search_keys,
2352
2412
  model_task_type=model_task_type,
2353
2413
  date_format=self.date_format,
2354
2414
  random_state=self.random_state,
2355
2415
  rest_client=self.rest_client,
2356
2416
  logger=self.logger,
2357
2417
  )
2358
- dataset.meaning_types = meaning_types
2359
- dataset.search_keys = combined_search_keys
2360
2418
  if email_converted_to_hem:
2361
2419
  dataset.ignore_columns = [email_column]
2362
2420
 
@@ -2905,15 +2963,19 @@ class FeaturesEnricher(TransformerMixin):
2905
2963
 
2906
2964
  @staticmethod
2907
2965
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2908
- for col, t in search_keys.items():
2909
- if t == SearchKey.EMAIL:
2910
- return col
2966
+ cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2967
+ if len(cols) > 1:
2968
+ raise Exception("More than one email column found after unnest")
2969
+ if len(cols) == 1:
2970
+ return cols[0]
2911
2971
 
2912
2972
  @staticmethod
2913
2973
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2914
- for col, t in search_keys.items():
2915
- if t == SearchKey.HEM:
2916
- return col
2974
+ cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2975
+ if len(cols) > 1:
2976
+ raise Exception("More than one hem column found after unnest")
2977
+ if len(cols) == 1:
2978
+ return cols[0]
2917
2979
 
2918
2980
  @staticmethod
2919
2981
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -2921,8 +2983,44 @@ class FeaturesEnricher(TransformerMixin):
2921
2983
  if t == SearchKey.PHONE:
2922
2984
  return col
2923
2985
 
2986
+ def _explode_multiple_search_keys(
2987
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
2988
+ ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
2989
+ # find groups of multiple search keys
2990
+ search_key_names_by_type: Dict[SearchKey, str] = {}
2991
+ for key_name, key_type in search_keys.items():
2992
+ search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
2993
+ search_key_names_by_type = {
2994
+ key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
2995
+ }
2996
+ if len(search_key_names_by_type) == 0:
2997
+ return df, {}
2998
+
2999
+ multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3000
+ other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3001
+ exploded_dfs = []
3002
+ unnest_search_keys = {}
3003
+
3004
+ for key_type, key_names in search_key_names_by_type.items():
3005
+ new_search_key = f"upgini_{key_type.name.lower()}_unnest"
3006
+ exploded_df = pd.melt(
3007
+ df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
3008
+ )
3009
+ exploded_dfs.append(exploded_df)
3010
+ for old_key in key_names:
3011
+ del search_keys[old_key]
3012
+ search_keys[new_search_key] = key_type
3013
+ unnest_search_keys[new_search_key] = key_names
3014
+
3015
+ df = pd.concat(exploded_dfs, ignore_index=True)
3016
+ return df, unnest_search_keys
3017
+
2924
3018
  def __add_fit_system_record_id(
2925
- self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
3019
+ self,
3020
+ df: pd.DataFrame,
3021
+ meaning_types: Dict[str, FileColumnMeaningType],
3022
+ search_keys: Dict[str, SearchKey],
3023
+ id_name: str,
2926
3024
  ) -> pd.DataFrame:
2927
3025
  # save original order or rows
2928
3026
  original_index_name = df.index.name
@@ -2971,14 +3069,18 @@ class FeaturesEnricher(TransformerMixin):
2971
3069
 
2972
3070
  df = df.reset_index(drop=True).reset_index()
2973
3071
  # system_record_id saves correct order for fit
2974
- df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3072
+ df = df.rename(columns={DEFAULT_INDEX: id_name})
2975
3073
 
2976
3074
  # return original order
2977
3075
  df = df.set_index(ORIGINAL_INDEX)
2978
3076
  df.index.name = original_index_name
2979
3077
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
2980
3078
 
2981
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3079
+ meaning_types[id_name] = (
3080
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3081
+ if id_name == SYSTEM_RECORD_ID
3082
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3083
+ )
2982
3084
  return df
2983
3085
 
2984
3086
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3033,7 +3135,11 @@ class FeaturesEnricher(TransformerMixin):
3033
3135
  )
3034
3136
 
3035
3137
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3036
- dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3138
+ dup_features = [
3139
+ c
3140
+ for c in comparing_columns
3141
+ if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3142
+ ]
3037
3143
  if len(dup_features) > 0:
3038
3144
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3039
3145
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3044,8 +3150,7 @@ class FeaturesEnricher(TransformerMixin):
3044
3150
  result_features = pd.merge(
3045
3151
  df_with_original_index,
3046
3152
  result_features,
3047
- left_on=SYSTEM_RECORD_ID,
3048
- right_on=SYSTEM_RECORD_ID,
3153
+ on=ENTITY_SYSTEM_RECORD_ID,
3049
3154
  how="left" if is_transform else "inner",
3050
3155
  )
3051
3156
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3056,7 +3161,7 @@ class FeaturesEnricher(TransformerMixin):
3056
3161
  result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
3057
3162
  self.logger.info(f"After dropping target outliers size: {len(result_features)}")
3058
3163
 
3059
- result_eval_sets = dict()
3164
+ result_eval_sets = {}
3060
3165
  if not is_transform and EVAL_SET_INDEX in result_features.columns:
3061
3166
  result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
3062
3167
  eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
@@ -3262,7 +3367,7 @@ class FeaturesEnricher(TransformerMixin):
3262
3367
  if autofe_feature.op.is_vector:
3263
3368
  continue
3264
3369
 
3265
- description = dict()
3370
+ description = {}
3266
3371
 
3267
3372
  feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
3268
3373
  if feature_meta is None:
@@ -3428,13 +3533,13 @@ class FeaturesEnricher(TransformerMixin):
3428
3533
  self.warning_counter.increment()
3429
3534
 
3430
3535
  if len(valid_search_keys) == 1:
3431
- for k, v in valid_search_keys.items():
3432
- # Show warning for country only if country is the only key
3433
- if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3434
- msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3435
- print(msg)
3436
- self.logger.warning(msg)
3437
- self.warning_counter.increment()
3536
+ key, value = list(valid_search_keys.items())[0]
3537
+ # Show warning for country only if country is the only key
3538
+ if x[key].nunique() == 1:
3539
+ msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3540
+ print(msg)
3541
+ self.logger.warning(msg)
3542
+ self.warning_counter.increment()
3438
3543
 
3439
3544
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3440
3545
 
@@ -3544,61 +3649,68 @@ class FeaturesEnricher(TransformerMixin):
3544
3649
  def check_need_detect(search_key: SearchKey):
3545
3650
  return not is_transform or search_key in self.fit_search_keys.values()
3546
3651
 
3547
- if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3548
- maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3549
- if maybe_key is not None:
3550
- search_keys[maybe_key] = SearchKey.POSTAL_CODE
3551
- self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3552
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3652
+ # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3653
+ if check_need_detect(SearchKey.POSTAL_CODE):
3654
+ maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3655
+ if maybe_keys:
3656
+ new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3657
+ search_keys.update(new_keys)
3658
+ self.autodetected_search_keys.update(new_keys)
3659
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3553
3660
  if not silent_mode:
3554
- print(self.bundle.get("postal_code_detected").format(maybe_key))
3661
+ print(self.bundle.get("postal_code_detected").format(maybe_keys))
3555
3662
 
3556
3663
  if (
3557
3664
  SearchKey.COUNTRY not in search_keys.values()
3558
3665
  and self.country_code is None
3559
3666
  and check_need_detect(SearchKey.COUNTRY)
3560
3667
  ):
3561
- maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3562
- if maybe_key is not None:
3563
- search_keys[maybe_key] = SearchKey.COUNTRY
3564
- self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3668
+ maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3669
+ if maybe_key:
3670
+ search_keys[maybe_key[0]] = SearchKey.COUNTRY
3671
+ self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3565
3672
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3566
3673
  if not silent_mode:
3567
3674
  print(self.bundle.get("country_detected").format(maybe_key))
3568
3675
 
3569
3676
  if (
3570
- SearchKey.EMAIL not in search_keys.values()
3571
- and SearchKey.HEM not in search_keys.values()
3677
+ # SearchKey.EMAIL not in search_keys.values()
3678
+ SearchKey.HEM not in search_keys.values()
3572
3679
  and check_need_detect(SearchKey.HEM)
3573
3680
  ):
3574
- maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3575
- if maybe_key is not None and maybe_key not in search_keys.keys():
3681
+ maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3682
+ if maybe_keys:
3576
3683
  if self.__is_registered or is_demo_dataset:
3577
- search_keys[maybe_key] = SearchKey.EMAIL
3578
- self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3579
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3684
+ new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3685
+ search_keys.update(new_keys)
3686
+ self.autodetected_search_keys.update(new_keys)
3687
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3580
3688
  if not silent_mode:
3581
- print(self.bundle.get("email_detected").format(maybe_key))
3689
+ print(self.bundle.get("email_detected").format(maybe_keys))
3582
3690
  else:
3583
3691
  self.logger.warning(
3584
- f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3692
+ f"Autodetected search key EMAIL in column {maybe_keys}."
3693
+ " But not used because not registered user"
3585
3694
  )
3586
3695
  if not silent_mode:
3587
- print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3696
+ print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3588
3697
  self.warning_counter.increment()
3589
3698
 
3590
- if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3591
- maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3592
- if maybe_key is not None and maybe_key not in search_keys.keys():
3699
+ # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3700
+ if check_need_detect(SearchKey.PHONE):
3701
+ maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3702
+ if maybe_keys:
3593
3703
  if self.__is_registered or is_demo_dataset:
3594
- search_keys[maybe_key] = SearchKey.PHONE
3595
- self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3596
- self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3704
+ new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3705
+ search_keys.update(new_keys)
3706
+ self.autodetected_search_keys.update(new_keys)
3707
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3597
3708
  if not silent_mode:
3598
- print(self.bundle.get("phone_detected").format(maybe_key))
3709
+ print(self.bundle.get("phone_detected").format(maybe_keys))
3599
3710
  else:
3600
3711
  self.logger.warning(
3601
- f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3712
+ f"Autodetected search key PHONE in column {maybe_keys}. "
3713
+ "But not used because not registered user"
3602
3714
  )
3603
3715
  if not silent_mode:
3604
3716
  print(self.bundle.get("phone_detected_not_registered"))
upgini/metadata.py CHANGED
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Set
4
4
  from pydantic import BaseModel
5
5
 
6
6
  SYSTEM_RECORD_ID = "system_record_id"
7
+ ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
8
+ SEARCH_KEY_UNNEST = "search_key_unnest"
7
9
  SORT_ID = "sort_id"
8
10
  EVAL_SET_INDEX = "eval_set_index"
9
11
  TARGET = "target"
@@ -11,7 +13,7 @@ COUNTRY = "country_iso_code"
11
13
  RENAMED_INDEX = "index_col"
12
14
  DEFAULT_INDEX = "index"
13
15
  ORIGINAL_INDEX = "original_index"
14
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
16
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
15
17
 
16
18
 
17
19
  class FileColumnMeaningType(Enum):
@@ -37,6 +39,8 @@ class FileColumnMeaningType(Enum):
37
39
  POSTAL_CODE = "POSTAL_CODE"
38
40
  SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
39
41
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
42
+ ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
43
+ UNNEST_KEY = "UNNEST_KEY"
40
44
 
41
45
 
42
46
  class SearchKey(Enum):
@@ -182,6 +186,10 @@ class FileColumnMetadata(BaseModel):
182
186
  meaningType: FileColumnMeaningType
183
187
  minMaxValues: Optional[NumericInterval] = None
184
188
  originalName: Optional[str]
189
+ # whether this column contains keys from multiple key columns, e.g. msisdn1, msisdn2
190
+ isUnnest: bool = False
191
+ # list of the original etalon (reference) key column names, e.g. msisdn1, msisdn2
192
+ unnestKeyNames: Optional[list[str]]
185
193
 
186
194
 
187
195
  class FileMetadata(BaseModel):
@@ -276,7 +284,7 @@ class FeaturesFilter(BaseModel):
276
284
 
277
285
 
278
286
  class RuntimeParameters(BaseModel):
279
- properties: Dict[str, str] = dict()
287
+ properties: Dict[str, str] = {}
280
288
 
281
289
 
282
290
  class SearchCustomization(BaseModel):
upgini/metrics.py CHANGED
@@ -357,7 +357,7 @@ class EstimatorWrapper:
357
357
  "logger": logger,
358
358
  }
359
359
  if estimator is None:
360
- params = dict()
360
+ params = {}
361
361
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
362
362
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
363
363
  if target_type == ModelTaskType.MULTICLASS:
@@ -88,6 +88,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
88
  search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
89
89
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
90
90
  single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
91
+ unsupported_multi_key=Search key {} cannot be used multiple times
91
92
  unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
92
93
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
93
94
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
11
  raise NotImplementedError()
12
12
 
13
- def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
- for column_name in column_names:
15
- if self._is_search_key_by_name(column_name):
16
- return column_name
13
+ def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
14
+ return [
15
+ column_name
16
+ for column_name in column_names
17
+ if self._is_search_key_by_name(column_name)
18
+ ]
17
19
 
18
- def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
19
- maybe_column = self._get_search_key_by_name(df.columns.to_list())
20
- if maybe_column is not None:
21
- return maybe_column
22
-
23
- for column_name in df.columns:
20
+ def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
21
+ other_columns = [col for col in df.columns if col not in existing_search_keys]
22
+ columns_by_names = self._get_search_keys_by_name(other_columns)
23
+ columns_by_values = []
24
+ for column_name in other_columns:
24
25
  if self._is_search_key_by_values(df[column_name]):
25
- return column_name
26
+ columns_by_values.append(column_name)
27
+ return list(set(columns_by_names + columns_by_values))
@@ -126,9 +126,9 @@ class DateTimeSearchKeyConverter:
126
126
  df.drop(columns=seconds, inplace=True)
127
127
 
128
128
  if keep_time:
129
- df[self.DATETIME_COL] = df[self.date_column].view(np.int64) // 1_000_000
129
+ df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
130
130
  df[self.DATETIME_COL] = df[self.DATETIME_COL].apply(self._int_to_opt).astype("Int64")
131
- df[self.date_column] = df[self.date_column].dt.floor("D").view(np.int64) // 1_000_000
131
+ df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
132
132
  df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
133
133
 
134
134
  self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")
@@ -249,7 +249,8 @@ def validate_dates_distribution(
249
249
  if col in search_keys:
250
250
  continue
251
251
  try:
252
- pd.to_datetime(X[col])
252
+ # Use format="mixed" to avoid a flood of warnings
253
+ pd.to_datetime(X[col], format="mixed")
253
254
  maybe_date_col = col
254
255
  break
255
256
  except Exception:
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import (
7
+ ENTITY_SYSTEM_RECORD_ID,
8
+ EVAL_SET_INDEX,
9
+ SORT_ID,
10
+ SYSTEM_RECORD_ID,
11
+ TARGET,
12
+ ModelTaskType,
13
+ SearchKey,
14
+ )
7
15
  from upgini.resource_bundle import ResourceBundle
8
16
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
17
  from upgini.utils.target_utils import define_task
@@ -143,6 +151,8 @@ def clean_full_duplicates(
143
151
  unique_columns = df.columns.tolist()
144
152
  if SYSTEM_RECORD_ID in unique_columns:
145
153
  unique_columns.remove(SYSTEM_RECORD_ID)
154
+ if ENTITY_SYSTEM_RECORD_ID in unique_columns:
155
+ unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
146
156
  if SORT_ID in unique_columns:
147
157
  unique_columns.remove(SORT_ID)
148
158
  if EVAL_SET_INDEX in unique_columns:
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
38
38
  email_column: str,
39
39
  hem_column: Optional[str],
40
40
  search_keys: Dict[str, SearchKey],
41
+ unnest_search_keys: Optional[List[str]] = None,
41
42
  logger: Optional[logging.Logger] = None,
42
43
  ):
43
44
  self.email_column = email_column
44
45
  self.hem_column = hem_column
45
46
  self.search_keys = search_keys
47
+ self.unnest_search_keys = unnest_search_keys
46
48
  if logger is not None:
47
49
  self.logger = logger
48
50
  else:
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
80
82
  del self.search_keys[self.email_column]
81
83
  return df
82
84
  self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
85
+ self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
83
86
  self.email_converted_to_hem = True
84
87
 
85
88
  del self.search_keys[self.email_column]
89
+ if self.email_column in self.unnest_search_keys:
90
+ self.unnest_search_keys.remove(self.email_column)
86
91
 
87
92
  df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
88
93
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.276
3
+ Version: 1.1.278a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -1,12 +1,12 @@
1
1
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
3
- upgini/dataset.py,sha256=HwL2syoMf3F9k9SmsJJMhhqnAddZcx28RZ1aYam7Lhs,45665
3
+ upgini/dataset.py,sha256=qdIxHiDGZT_iNTBswNeIuc9TPfvUlNqvSmRqMyigZBM,46187
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=ys7RQoZsyY8-NkUZyp12K8z5aQmg7pyx0LtwclFtXkc,176358
5
+ upgini/features_enricher.py,sha256=wsrm9uwIr7hCNLBXTEuw4nIuIXfJrsZ7RWFeG24tTzI,181665
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
8
- upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
9
- upgini/metrics.py,sha256=tGzdn0jgup86OlH_GS4eoza8ZJZ9wgaJr7SaX3Upwzo,29652
8
+ upgini/metadata.py,sha256=TNZbtIuxYkBFGQu3gGm2flA6vsKyUPN4Q-Du3fFjmSM,10101
9
+ upgini/metrics.py,sha256=YhyPik38cBI5x5KfdiE_qocJnUjZbSqUj8GUtCqnG0g,29648
10
10
  upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
11
11
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
12
12
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -29,22 +29,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
29
29
  upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
30
30
  upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
31
31
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
32
- upgini/resource_bundle/strings.properties,sha256=1O779a0-Ai0j7W-Z5AznvjuV69YkJvgGhJda-6VMLOQ,26287
32
+ upgini/resource_bundle/strings.properties,sha256=-JDIa0nAoA5utK7UZZAUgLDsozJNI08dDcbIaOSsvQg,26353
33
33
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
34
34
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
36
36
  upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
37
37
  upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
38
38
  upgini/utils/__init__.py,sha256=YVum3lRKpyfqoJy_7HJyU6SmIgbmG8QLkHIpibE_ud8,842
39
- upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCSnLGon_W9TPs,859
39
+ upgini/utils/base_search_key_detector.py,sha256=VvEdamjJT1wypsH6NAfOkPp7dHo7nxhl7LhwX7Z9N5w,1025
40
40
  upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
41
41
  upgini/utils/country_utils.py,sha256=pV8TBURthYqwSOfH1lxfYc2blm3OvfLFCMvRv8rKTp4,6511
42
42
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
43
43
  upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
44
- upgini/utils/datetime_utils.py,sha256=_mfhWb5ogEThvanQ-py1Lb6VvUvF2vT20tQgNprNz6o,10321
45
- upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
44
+ upgini/utils/datetime_utils.py,sha256=RW9eGCGQyYBsIU9XbYKt4hQiXUNppb4Grszg4EdKeY4,10398
45
+ upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
46
46
  upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
47
- upgini/utils/email_utils.py,sha256=R9bVOfbS-oVkA8PdwZfQBxm7B4mQlRtkwqx2cf6zPCY,3520
47
+ upgini/utils/email_utils.py,sha256=KHqIUagBWd3jOj3V7mW0ZkBOc-2XzAIA3p1xxZgy-L4,3813
48
48
  upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
49
49
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
50
50
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
@@ -56,8 +56,8 @@ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,4
56
56
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
57
57
  upgini/utils/track_info.py,sha256=p8gmuHhLamZF5JG7K9DeK-PcytQhlFCR29lyRr-wq_U,5665
58
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
59
- upgini-1.1.276.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
- upgini-1.1.276.dist-info/METADATA,sha256=Dgb4UJ82UknhtKS9DHiGRu-a9i3LeoKZiVWpCzkJfF4,48156
61
- upgini-1.1.276.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
- upgini-1.1.276.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
- upgini-1.1.276.dist-info/RECORD,,
59
+ upgini-1.1.278a1.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.278a1.dist-info/METADATA,sha256=q6o1ge7o56ZvJk11K_v2tfPAREHvZMW9kHPbotGeJEo,48158
61
+ upgini-1.1.278a1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
+ upgini-1.1.278a1.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.278a1.dist-info/RECORD,,