upgini 1.2.86a2__py3-none-any.whl → 1.2.87__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.86a2"
+__version__ = "1.2.87"
upgini/data_source/data_source_publisher.py CHANGED
@@ -5,6 +5,8 @@ from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Literal, Optional, Union
 
+import pandas as pd
+
 from upgini.errors import HttpError, ValidationError
 from upgini.http import LoggerFactory, get_rest_client
 from upgini.mdc import MDC
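
In the validation hunk below, both searchKeys and secondarySearchKeys arrive as lists of key groups, so the new duplicate-mapping check flattens them into sets before comparing; nesting and order then no longer matter. A minimal, self-contained sketch of that comparison (key names made up):

    # "secondarySearchKeys" arrives as a list of key groups:
    row_secondary = [["EMAIL", "HEM"], ["PHONE"]]
    existing_secondary_keys = {k for group in row_secondary for k in group}

    # The requested keys are compared as a flat set:
    requested = {"PHONE", "EMAIL", "HEM"}
    print(existing_secondary_keys == requested)  # True -> duplicate mapping, ValidationError
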
@@ -137,6 +139,25 @@ class DataSourcePublisher:
         ) and not date_format:
             raise ValidationError("date_format argument is required for PHONE+DATE and HEM+DATE search keys")
 
+        if secondary_search_keys:
+            response = self._rest_client.get_active_ads_definitions()
+            definitions = pd.DataFrame(response["adsDefinitions"])
+            prod_secondary_definitions = definitions.query(
+                "(secondarySearchKeys.astype('string') != '[]') & (adsDefinitionAccessType == 'PROD')"
+            )[["name", "searchKeys", "secondarySearchKeys"]]
+            for _, row in prod_secondary_definitions.iterrows():
+                existing_secondary_keys = {item for sublist in row["secondarySearchKeys"] for item in sublist}
+                if existing_secondary_keys == {v.value.name for v in secondary_search_keys.values()}:
+                    existing_search_keys = {item for sublist in row["searchKeys"] for item in sublist}
+                    if (
+                        existing_search_keys == {v.value.name for v in search_keys.values()}
+                        or ("IP" in str(existing_search_keys) and "IP" in str(search_keys.values()))
+                    ):
+                        raise ValidationError(
+                            "ADS with the same PRIMARY_KEYS -> SECONDARY_KEYS mapping "
+                            f"already exists: {row['name']}"
+                        )
+
         request = {
             "dataTableUri": data_table_uri,
             "searchKeys": {k: v.value.value for k, v in search_keys.items()},
upgini/features_enricher.py CHANGED
@@ -30,7 +30,7 @@ from pandas.api.types import (
 from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
-from sklearn.model_selection import BaseCrossValidator
+from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -71,6 +71,7 @@ from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.search_task import SearchTask
 from upgini.spinner import Spinner
 from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.country_utils import (
     CountrySearchKeyConverter,
     CountrySearchKeyDetector,
@@ -114,7 +115,9 @@ from upgini.utils.postal_code_utils import (
 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
 except Exception:
-    from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
+    from upgini.utils.fallback_progress_bar import (
+        CustomFallbackProgressBar as ProgressBar,
+    )
 
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
@@ -239,6 +242,7 @@ class FeaturesEnricher(TransformerMixin):
         add_date_if_missing: bool = True,
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
+        generate_search_key_features: bool = True,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
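
The constructor gains an off-switch for client-side feature generation on search keys (cyclical date parts and email-domain features, per the hunks below). A hedged usage sketch; the column names here are placeholders:

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(
        search_keys={"order_date": SearchKey.DATE, "email": SearchKey.EMAIL},
        # New in 1.2.87: keep the keys for matching, but skip generating
        # client-side features from them.
        generate_search_key_features=False,
    )
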
@@ -296,7 +300,7 @@ class FeaturesEnricher(TransformerMixin):
         self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
         self.metrics: Optional[pd.DataFrame] = None
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
@@ -311,7 +315,7 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
             self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
             file_metadata = self._search_task.get_file_metadata(trace_id)
-            x_columns = [c.originalName or c.name for c in file_metadata.columns]
+            x_columns = [c.name for c in file_metadata.columns]
             self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
             df = pd.DataFrame(columns=x_columns)
             self.__prepare_feature_importances(trace_id, df, silent=True)
@@ -365,6 +369,8 @@ class FeaturesEnricher(TransformerMixin):
         self.exclude_columns = exclude_columns
         self.baseline_score_column = baseline_score_column
         self.add_date_if_missing = add_date_if_missing
+        self.generate_search_key_features = generate_search_key_features
+
         self.features_info_display_handle = None
         self.data_sources_display_handle = None
         self.autofe_features_display_handle = None
@@ -1045,6 +1051,7 @@ class FeaturesEnricher(TransformerMixin):
             self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
             has_date = self._get_date_column(search_keys) is not None
+            has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
             model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
             cat_features = list(set(client_cat_features + cat_features_from_backend))
             baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
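
One reading note on the new line: and binds tighter than or in Python, so the expression groups as (has_date and isinstance(_cv, TimeSeriesSplit)) or isinstance(_cv, BlockedTimeSeriesSplit), meaning a BlockedTimeSeriesSplit sets has_time even without a date column. A self-contained check of that grouping:

    # Python groups "A and B or C" as "(A and B) or C":
    has_date, is_ts_split, is_blocked_split = False, False, True
    assert (has_date and is_ts_split or is_blocked_split) == ((has_date and is_ts_split) or is_blocked_split)
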
@@ -1077,7 +1084,7 @@ class FeaturesEnricher(TransformerMixin):
                     add_params=custom_loss_add_params,
                     groups=groups,
                     text_features=text_features,
-                    has_date=has_date,
+                    has_time=has_time,
                 )
                 baseline_cv_result = baseline_estimator.cross_val_predict(
                     fitting_X, y_sorted, baseline_score_column
@@ -1112,7 +1119,7 @@ class FeaturesEnricher(TransformerMixin):
                     add_params=custom_loss_add_params,
                     groups=groups,
                     text_features=text_features,
-                    has_date=has_date,
+                    has_time=has_time,
                 )
                 enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
                 enriched_metric = enriched_cv_result.get_display_metric()
@@ -1773,7 +1780,13 @@ class FeaturesEnricher(TransformerMixin):
         date_column = self._get_date_column(search_keys)
         generated_features = []
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
+            converter = DateTimeSearchKeyConverter(
+                date_column,
+                self.date_format,
+                self.logger,
+                self.bundle,
+                generate_cyclical_features=self.generate_search_key_features,
+            )
             # Leave original date column values
             df_with_date_features = converter.convert(df, keep_time=True)
             df_with_date_features[date_column] = df[date_column]
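
The generate_cyclical_features flag gates the date-derived features behind the new constructor option. The exact feature set is internal to DateTimeSearchKeyConverter; purely as an assumption for illustration, cyclical encoding of a date part is commonly a sin/cos pair, which keeps the two ends of a cycle adjacent:

    import numpy as np
    import pandas as pd

    dates = pd.to_datetime(pd.Series(["2024-01-01", "2024-07-01", "2024-12-31"]))
    day_of_year = dates.dt.dayofyear
    # Dec 31 and Jan 1 land close together in (sin, cos) space,
    # unlike in the raw day-of-year ordinal.
    sin_doy = np.sin(2 * np.pi * day_of_year / 365.25)
    cos_doy = np.cos(2 * np.pi * day_of_year / 365.25)
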
@@ -1781,7 +1794,7 @@ class FeaturesEnricher(TransformerMixin):
             generated_features = converter.generated_features
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
-        if email_columns:
+        if email_columns and self.generate_search_key_features:
             generator = EmailDomainGenerator(email_columns)
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)
@@ -2204,10 +2217,12 @@ class FeaturesEnricher(TransformerMixin):
                     {"name": name, "value": key_example(sk_type)} for name in sk_meta.unnestKeyNames
                 ]
             else:
-                search_keys_with_values[sk_type.name] = [{
-                    "name": sk_meta.originalName,
-                    "value": key_example(sk_type),
-                }]
+                search_keys_with_values[sk_type.name] = [
+                    {
+                        "name": sk_meta.originalName,
+                        "value": key_example(sk_type),
+                    }
+                ]
 
         keys_section = json.dumps(search_keys_with_values)
         features_for_transform = self._search_task.get_features_for_transform()
@@ -2284,11 +2299,16 @@ if response.status_code == 200:
 
         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
-        self.__validate_search_keys(self.search_keys, self.search_id)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
+        # If there are no important features, return original dataframe
+        if not filtered_columns:
+            msg = self.bundle.get("no_important_features_for_transform")
+            self.__log_warning(msg, show_support_link=True)
+            return X, {c: c for c in X.columns}, [], dict()
 
-        if len(self.feature_names_) == 0:
-            self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-            return X, {c: c for c in X.columns}, [], {}
+        self.__validate_search_keys(self.search_keys, self.search_id)
 
         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
@@ -2327,9 +2347,7 @@ if response.status_code == 200:
 
         is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-        columns_to_drop = [
-            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
-        ]
+        columns_to_drop = [c for c in df.columns if c in self.feature_names_]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
@@ -2360,7 +2378,13 @@ if response.status_code == 200:
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
+            converter = DateTimeSearchKeyConverter(
+                date_column,
+                self.date_format,
+                self.logger,
+                bundle=self.bundle,
+                generate_cyclical_features=self.generate_search_key_features,
+            )
             df = converter.convert(df, keep_time=True)
             self.logger.info(f"Date column after convertion: {df[date_column]}")
             generated_features.extend(converter.generated_features)
@@ -2370,7 +2394,7 @@ if response.status_code == 200:
             df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
-        if email_columns:
+        if email_columns and self.generate_search_key_features:
             generator = EmailDomainGenerator(email_columns)
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)
@@ -2379,6 +2403,17 @@ if response.status_code == 200:
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
 
+        # If there are no external features, we don't call backend on transform
+        external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
+        if not external_features:
+            self.logger.warning(
+                "No external features found, returning original dataframe"
+                f" with generated important features: {filtered_columns}"
+            )
+            filtered_columns = [c for c in filtered_columns if c in df.columns]
+            self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
+            return df[filtered_columns], columns_renaming, generated_features, search_keys
+
         # Don't pass all features in backend on transform
         runtime_parameters = self._get_copy_of_runtime_parameters()
         features_for_transform = self._search_task.get_features_for_transform() or []
@@ -2423,6 +2458,8 @@ if response.status_code == 200:
         # Explode multiple search keys
         df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
 
+        # Convert search keys and generate features on them
+
         email_column = self._get_email_column(search_keys)
         hem_column = self._get_hem_column(search_keys)
         if email_column:
@@ -2611,17 +2648,15 @@ if response.status_code == 200:
             how="left",
         )
 
+        selected_generated_features = [
+            c for c in generated_features if not self.fit_select_features or c in filtered_columns
+        ]
         selecting_columns = [
             c
-            for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
-            if c not in self.dropped_client_feature_names_
+            for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
+            if c not in self.zero_shap_client_features
         ]
-        filtered_columns = self.__filtered_enriched_features(
-            importance_threshold, max_features, trace_id, validated_X
-        )
-        selecting_columns.extend(
-            c for c in filtered_columns if c in result.columns and c not in validated_X.columns
-        )
+        selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
             selecting_columns.append(SORT_ID)
 
@@ -2860,6 +2895,7 @@ if response.status_code == 200:
             self.date_format,
             self.logger,
             bundle=self.bundle,
+            generate_cyclical_features=self.generate_search_key_features,
         )
         df = converter.convert(df, keep_time=True)
         if converter.has_old_dates:
@@ -2872,7 +2908,7 @@ if response.status_code == 200:
             df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
-        if email_columns:
+        if email_columns and self.generate_search_key_features:
             generator = EmailDomainGenerator(email_columns)
             df = generator.generate(df)
             self.fit_generated_features.extend(generator.generated_features)
@@ -2920,7 +2956,10 @@ if response.status_code == 200:
             self.__log_warning(fintech_warning)
         df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
         if full_duplicates_warning:
-            self.__log_warning(full_duplicates_warning)
+            if len(df) == 0:
+                raise ValidationError(full_duplicates_warning)
+            else:
+                self.__log_warning(full_duplicates_warning)
 
         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
@@ -3323,9 +3362,13 @@ if response.status_code == 200:
         Xy[TARGET] = y
         validated_y = Xy[TARGET].copy()
 
-        if validated_y.nunique() < 2:
+        y_nunique = validated_y.nunique()
+        if y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant"))
 
+        if self.model_task_type == ModelTaskType.BINARY and y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_unique_count_not_2").format(y_nunique))
+
         return validated_y
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
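
The added guard (applied to the eval set in the next hunk as well) rejects a task declared BINARY whose target does not contain exactly two distinct values, failing fast with a ValidationError instead of erroring later in training. The check itself is plain pandas nunique:

    import pandas as pd

    y = pd.Series([0, 1, 2, 1, 0])  # three classes
    y_nunique = y.nunique()         # 3
    print(y_nunique != 2)           # True -> would raise for ModelTaskType.BINARY
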
@@ -3400,9 +3443,13 @@ if response.status_code == 200:
         else:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
-        if validated_eval_y.nunique() < 2:
+        eval_y_nunique = validated_eval_y.nunique()
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+            raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
+
         return validated_eval_X, validated_eval_y
 
     def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
@@ -3564,7 +3611,9 @@ if response.status_code == 200:
         maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
             # TODO cast date column to single dtype
-            date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
+            date_converter = DateTimeSearchKeyConverter(
+                maybe_date_col, self.date_format, generate_cyclical_features=False
+            )
             converted_X = date_converter.convert(X)
             min_date = converted_X[maybe_date_col].min()
             max_date = converted_X[maybe_date_col].max()
@@ -3603,7 +3652,7 @@ if response.status_code == 200:
             self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
-            converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
+            converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
             df = converter.convert(df)
         return df
 
@@ -3942,10 +3991,11 @@ if response.status_code == 200:
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
+        # To be sure that names with hash suffixes
         df = df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
-        self.dropped_client_feature_names_ = []
+        self.zero_shap_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -3957,7 +4007,7 @@ if response.status_code == 200:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
-            is_client_feature = feature_meta.name in df.columns
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -3973,13 +4023,13 @@ if response.status_code == 200:
         features_meta.sort(key=lambda m: (-m.shap_value, m.name))
 
         for feature_meta in features_meta:
-
-            is_client_feature = feature_meta.name in df.columns
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            is_client_feature = original_name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
-                if self.fit_select_features:
-                    self.dropped_client_feature_names_.append(feature_meta.name)
+                if is_client_feature and self.fit_select_features:
+                    self.zero_shap_client_features.append(original_name)
                 continue
 
             # Use only important features