upgini 1.2.27__py3-none-any.whl → 1.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.27"
1
+ __version__ = "1.2.29"
@@ -54,6 +54,7 @@ from upgini.metadata import (
54
54
  SYSTEM_RECORD_ID,
55
55
  TARGET,
56
56
  CVType,
57
+ FeaturesMetadataV2,
57
58
  FileColumnMeaningType,
58
59
  ModelTaskType,
59
60
  RuntimeParameters,
@@ -95,6 +96,7 @@ from upgini.utils.email_utils import (
95
96
  EmailSearchKeyConverter,
96
97
  EmailSearchKeyDetector,
97
98
  )
99
+ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
98
100
  from upgini.utils.features_validator import FeaturesValidator
99
101
  from upgini.utils.format import Format
100
102
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -158,6 +160,10 @@ class FeaturesEnricher(TransformerMixin):
158
160
 
159
161
  shared_datasets: list of str, optional (default=None)
160
162
  List of private shared dataset ids for custom search
163
+
164
+ select_features: bool, optional (default=False)
165
+ If True, return only selected features both from input and data sources.
166
+ Otherwise, return all features from input and only selected features from data sources.
161
167
  """
162
168
 
163
169
  TARGET_NAME = "target"
@@ -224,6 +230,7 @@ class FeaturesEnricher(TransformerMixin):
224
230
  client_visitorid: Optional[str] = None,
225
231
  custom_bundle_config: Optional[str] = None,
226
232
  add_date_if_missing: bool = True,
233
+ select_features: bool = False,
227
234
  **kwargs,
228
235
  ):
229
236
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -277,8 +284,11 @@ class FeaturesEnricher(TransformerMixin):
277
284
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
278
285
  self.metrics: Optional[pd.DataFrame] = None
279
286
  self.feature_names_ = []
287
+ self.dropped_client_feature_names_ = []
280
288
  self.feature_importances_ = []
281
289
  self.search_id = search_id
290
+ self.select_features = select_features
291
+
282
292
  if search_id:
283
293
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
284
294
 
@@ -999,9 +1009,10 @@ class FeaturesEnricher(TransformerMixin):
999
1009
  text_features=self.generate_features,
1000
1010
  has_date=has_date,
1001
1011
  )
1002
- etalon_metric, _ = baseline_estimator.cross_val_predict(
1012
+ etalon_cv_result = baseline_estimator.cross_val_predict(
1003
1013
  fitting_X, y_sorted, self.baseline_score_column
1004
1014
  )
1015
+ etalon_metric = etalon_cv_result.get_display_metric()
1005
1016
  if etalon_metric is None:
1006
1017
  self.logger.info(
1007
1018
  f"Baseline {metric} on train client features is None (maybe all features was removed)"
@@ -1033,9 +1044,9 @@ class FeaturesEnricher(TransformerMixin):
1033
1044
  text_features=self.generate_features,
1034
1045
  has_date=has_date,
1035
1046
  )
1036
- enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1037
- fitting_enriched_X, enriched_y_sorted
1038
- )
1047
+ enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1048
+ enriched_metric = enriched_cv_result.get_display_metric()
1049
+ enriched_shaps = enriched_cv_result.shap_values
1039
1050
 
1040
1051
  if enriched_shaps is not None:
1041
1052
  self._update_shap_values(enriched_shaps)
@@ -1048,7 +1059,7 @@ class FeaturesEnricher(TransformerMixin):
1048
1059
  else:
1049
1060
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1050
1061
  if etalon_metric is not None and enriched_metric is not None:
1051
- uplift = (enriched_metric - etalon_metric) * multiplier
1062
+ uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
1052
1063
 
1053
1064
  train_metrics = {
1054
1065
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1091,9 +1102,10 @@ class FeaturesEnricher(TransformerMixin):
1091
1102
  f"Calculate baseline {metric} on eval set {idx + 1} "
1092
1103
  f"on client features: {eval_X_sorted.columns.to_list()}"
1093
1104
  )
1094
- etalon_eval_metric = baseline_estimator.calculate_metric(
1105
+ etalon_eval_results = baseline_estimator.calculate_metric(
1095
1106
  eval_X_sorted, eval_y_sorted, self.baseline_score_column
1096
1107
  )
1108
+ etalon_eval_metric = etalon_eval_results.get_display_metric()
1097
1109
  self.logger.info(
1098
1110
  f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
1099
1111
  )
@@ -1105,9 +1117,10 @@ class FeaturesEnricher(TransformerMixin):
1105
1117
  f"Calculate enriched {metric} on eval set {idx + 1} "
1106
1118
  f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
1107
1119
  )
1108
- enriched_eval_metric = enriched_estimator.calculate_metric(
1120
+ enriched_eval_results = enriched_estimator.calculate_metric(
1109
1121
  enriched_eval_X_sorted, enriched_eval_y_sorted
1110
1122
  )
1123
+ enriched_eval_metric = enriched_eval_results.get_display_metric()
1111
1124
  self.logger.info(
1112
1125
  f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
1113
1126
  )
@@ -1115,7 +1128,7 @@ class FeaturesEnricher(TransformerMixin):
1115
1128
  enriched_eval_metric = None
1116
1129
 
1117
1130
  if etalon_eval_metric is not None and enriched_eval_metric is not None:
1118
- eval_uplift = (enriched_eval_metric - etalon_eval_metric) * multiplier
1131
+ eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
1119
1132
  else:
1120
1133
  eval_uplift = None
1121
1134
 
@@ -1198,9 +1211,7 @@ class FeaturesEnricher(TransformerMixin):
1198
1211
 
1199
1212
  def _update_shap_values(self, new_shaps: Dict[str, float]):
1200
1213
  new_shaps = {
1201
- feature: self._round_shap_value(shap)
1202
- for feature, shap in new_shaps.items()
1203
- if feature in self.feature_names_
1214
+ feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1204
1215
  }
1205
1216
  features_importances = list(new_shaps.items())
1206
1217
  features_importances.sort(key=lambda m: (-m[1], m[0]))
@@ -1249,7 +1260,7 @@ class FeaturesEnricher(TransformerMixin):
1249
1260
  display_html_dataframe(
1250
1261
  self.relevant_data_sources,
1251
1262
  self._relevant_data_sources_wo_links,
1252
- self.bundle.get("relevant_features_header"),
1263
+ self.bundle.get("relevant_data_sources_header"),
1253
1264
  display_handle=self.data_sources_display_handle,
1254
1265
  )
1255
1266
  except (ImportError, NameError):
@@ -1437,7 +1448,12 @@ class FeaturesEnricher(TransformerMixin):
1437
1448
  client_features = [
1438
1449
  c
1439
1450
  for c in X_sampled.columns.to_list()
1440
- if c
1451
+ if (
1452
+ not self.select_features
1453
+ or c in self.feature_names_
1454
+ or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1455
+ )
1456
+ and c
1441
1457
  not in (
1442
1458
  excluding_search_keys
1443
1459
  + list(self.fit_dropped_features)
@@ -1653,7 +1669,10 @@ class FeaturesEnricher(TransformerMixin):
1653
1669
  generated_features = []
1654
1670
  if date_column is not None:
1655
1671
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1656
- df = converter.convert(df, keep_time=True)
1672
+ # Leave original date column values
1673
+ df_with_date_features = converter.convert(df, keep_time=True)
1674
+ df_with_date_features[date_column] = df[date_column]
1675
+ df = df_with_date_features
1657
1676
  generated_features = converter.generated_features
1658
1677
 
1659
1678
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -1662,9 +1681,10 @@ class FeaturesEnricher(TransformerMixin):
1662
1681
  df = generator.generate(df)
1663
1682
  generated_features.extend(generator.generated_features)
1664
1683
 
1665
- normalizer = Normalizer(self.bundle, self.logger)
1666
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1667
- columns_renaming = normalizer.columns_renaming
1684
+ # normalizer = Normalizer(self.bundle, self.logger)
1685
+ # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1686
+ # columns_renaming = normalizer.columns_renaming
1687
+ columns_renaming = {c: c for c in df.columns}
1668
1688
 
1669
1689
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1670
1690
 
@@ -1980,9 +2000,19 @@ class FeaturesEnricher(TransformerMixin):
1980
2000
  file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
1981
2001
  search_keys = file_metadata.search_types()
1982
2002
  if SearchKey.IPV6_ADDRESS in search_keys:
1983
- search_keys.remove(SearchKey.IPV6_ADDRESS)
2003
+ # search_keys.remove(SearchKey.IPV6_ADDRESS)
2004
+ search_keys.pop(SearchKey.IPV6_ADDRESS, None)
1984
2005
 
1985
- keys = "{" + ", ".join([f'"{key.name}": "{key_example(key)}"' for key in search_keys]) + "}"
2006
+ keys = (
2007
+ "{"
2008
+ + ", ".join(
2009
+ [
2010
+ f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
2011
+ for key, name in search_keys.items()
2012
+ ]
2013
+ )
2014
+ + "}"
2015
+ )
1986
2016
  features_for_transform = self._search_task.get_features_for_transform()
1987
2017
  if features_for_transform:
1988
2018
  original_features_for_transform = [
@@ -2026,7 +2056,10 @@ class FeaturesEnricher(TransformerMixin):
2026
2056
  start_time = time.time()
2027
2057
  with MDC(trace_id=trace_id):
2028
2058
  self.logger.info("Start transform")
2029
- self.__log_debug_information(X, exclude_features_sources=exclude_features_sources)
2059
+
2060
+ validated_X = self._validate_X(X, is_transform=True)
2061
+
2062
+ self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
2030
2063
 
2031
2064
  self.__validate_search_keys(self.search_keys, self.search_id)
2032
2065
 
@@ -2058,11 +2091,11 @@ class FeaturesEnricher(TransformerMixin):
2058
2091
  self.logger.info(msg)
2059
2092
  print(msg)
2060
2093
 
2061
- validated_X = self._validate_X(X, is_transform=True)
2062
-
2063
2094
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2064
2095
 
2065
- columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
2096
+ columns_to_drop = [
2097
+ c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
2098
+ ]
2066
2099
  if len(columns_to_drop) > 0:
2067
2100
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2068
2101
  self.logger.warning(msg)
@@ -2091,7 +2124,7 @@ class FeaturesEnricher(TransformerMixin):
2091
2124
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2092
2125
  if date_column is not None:
2093
2126
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2094
- df = converter.convert(df)
2127
+ df = converter.convert(df, keep_time=True)
2095
2128
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2096
2129
  generated_features.extend(converter.generated_features)
2097
2130
  else:
@@ -2186,11 +2219,12 @@ class FeaturesEnricher(TransformerMixin):
2186
2219
 
2187
2220
  if add_fit_system_record_id:
2188
2221
  df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2189
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2190
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2191
2222
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2192
2223
  features_not_to_pass.append(SORT_ID)
2193
2224
 
2225
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2226
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2227
+
2194
2228
  # search keys might be changed after explode
2195
2229
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2196
2230
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
@@ -2209,7 +2243,7 @@ class FeaturesEnricher(TransformerMixin):
2209
2243
 
2210
2244
  combined_search_keys = combine_search_keys(search_keys.keys())
2211
2245
 
2212
- df_without_features = df.drop(columns=features_not_to_pass)
2246
+ df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2213
2247
 
2214
2248
  df_without_features, full_duplicates_warning = clean_full_duplicates(
2215
2249
  df_without_features, self.logger, bundle=self.bundle
@@ -2318,11 +2352,15 @@ class FeaturesEnricher(TransformerMixin):
2318
2352
  else:
2319
2353
  result = enrich()
2320
2354
 
2355
+ selecting_columns = [
2356
+ c
2357
+ for c in itertools.chain(validated_X.columns.tolist(), generated_features)
2358
+ if c not in self.dropped_client_feature_names_
2359
+ ]
2321
2360
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2322
- existing_filtered_columns = [
2361
+ selecting_columns.extend(
2323
2362
  c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2324
- ]
2325
- selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2363
+ )
2326
2364
  if add_fit_system_record_id:
2327
2365
  selecting_columns.append(SORT_ID)
2328
2366
 
@@ -2476,9 +2514,9 @@ class FeaturesEnricher(TransformerMixin):
2476
2514
  validate_scoring_argument(scoring)
2477
2515
 
2478
2516
  self.__log_debug_information(
2479
- X,
2480
- y,
2481
- eval_set,
2517
+ validated_X,
2518
+ validated_y,
2519
+ validated_eval_set,
2482
2520
  exclude_features_sources=exclude_features_sources,
2483
2521
  calculate_metrics=calculate_metrics,
2484
2522
  scoring=scoring,
@@ -3489,15 +3527,7 @@ class FeaturesEnricher(TransformerMixin):
3489
3527
 
3490
3528
  return result_train, result_eval_sets
3491
3529
 
3492
- @staticmethod
3493
- def _round_shap_value(shap: float) -> float:
3494
- if shap > 0.0 and shap < 0.0001:
3495
- return 0.0001
3496
- else:
3497
- return round(shap, 4)
3498
-
3499
3530
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3500
- llm_source = "LLM with external data augmentation"
3501
3531
  if self._search_task is None:
3502
3532
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3503
3533
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3508,116 +3538,40 @@ class FeaturesEnricher(TransformerMixin):
3508
3538
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3509
3539
 
3510
3540
  self.feature_names_ = []
3541
+ self.dropped_client_feature_names_ = []
3511
3542
  self.feature_importances_ = []
3512
3543
  features_info = []
3513
3544
  features_info_without_links = []
3514
3545
  internal_features_info = []
3515
3546
 
3516
- def list_or_single(lst: List[str], single: str):
3517
- return lst or ([single] if single else [])
3518
-
3519
- def to_anchor(link: str, value: str) -> str:
3520
- if not value:
3521
- return ""
3522
- elif not link:
3523
- return value
3524
- elif value == llm_source:
3525
- return value
3526
- else:
3527
- return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
3528
-
3529
- def make_links(names: List[str], links: List[str]):
3530
- all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
3531
- return ",".join(all_links)
3532
-
3533
3547
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3534
3548
  for feature_meta in features_meta:
3535
3549
  if feature_meta.name in original_names_dict.keys():
3536
3550
  feature_meta.name = original_names_dict[feature_meta.name]
3537
- # Use only enriched features
3551
+
3552
+ is_client_feature = feature_meta.name in x_columns
3553
+
3554
+ if feature_meta.shap_value == 0.0:
3555
+ if self.select_features:
3556
+ self.dropped_client_feature_names_.append(feature_meta.name)
3557
+ continue
3558
+
3559
+ # Use only important features
3538
3560
  if (
3539
- feature_meta.name in x_columns
3561
+ feature_meta.name in self.fit_generated_features
3540
3562
  or feature_meta.name == COUNTRY
3541
- or feature_meta.shap_value == 0.0
3542
- or feature_meta.name in self.fit_generated_features
3563
+ # In select_features mode we select also from etalon features and need to show them
3564
+ or (not self.select_features and is_client_feature)
3543
3565
  ):
3544
3566
  continue
3545
3567
 
3546
- feature_sample = []
3547
3568
  self.feature_names_.append(feature_meta.name)
3548
- self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3549
- if feature_meta.name in features_df.columns:
3550
- feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3551
- if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
3552
- feature_sample = [round(f, 4) for f in feature_sample]
3553
- feature_sample = [str(f) for f in feature_sample]
3554
- feature_sample = ", ".join(feature_sample)
3555
- if len(feature_sample) > 30:
3556
- feature_sample = feature_sample[:30] + "..."
3557
-
3558
- internal_provider = feature_meta.data_provider or "Upgini"
3559
- providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
3560
- provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
3561
- if providers:
3562
- provider = make_links(providers, provider_links)
3563
- else:
3564
- provider = to_anchor("https://upgini.com", "Upgini")
3565
-
3566
- internal_source = feature_meta.data_source or (
3567
- llm_source
3568
- if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
3569
- else ""
3570
- )
3571
- sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
3572
- source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
3573
- if sources:
3574
- source = make_links(sources, source_links)
3575
- else:
3576
- source = internal_source
3569
+ self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3577
3570
 
3578
- internal_feature_name = feature_meta.name
3579
- if feature_meta.doc_link:
3580
- feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
3581
- else:
3582
- feature_name = internal_feature_name
3583
-
3584
- features_info.append(
3585
- {
3586
- self.bundle.get("features_info_name"): feature_name,
3587
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3588
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3589
- self.bundle.get("features_info_value_preview"): feature_sample,
3590
- self.bundle.get("features_info_provider"): provider,
3591
- self.bundle.get("features_info_source"): source,
3592
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3593
- }
3594
- )
3595
- features_info_without_links.append(
3596
- {
3597
- self.bundle.get("features_info_name"): internal_feature_name,
3598
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3599
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3600
- self.bundle.get("features_info_value_preview"): feature_sample,
3601
- self.bundle.get("features_info_provider"): internal_provider,
3602
- self.bundle.get("features_info_source"): internal_source,
3603
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3604
- }
3605
- )
3606
- internal_features_info.append(
3607
- {
3608
- self.bundle.get("features_info_name"): internal_feature_name,
3609
- "feature_link": feature_meta.doc_link,
3610
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3611
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3612
- self.bundle.get("features_info_value_preview"): feature_sample,
3613
- self.bundle.get("features_info_provider"): internal_provider,
3614
- "provider_link": feature_meta.data_provider_link,
3615
- self.bundle.get("features_info_source"): internal_source,
3616
- "source_link": feature_meta.data_source_link,
3617
- self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3618
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3619
- }
3620
- )
3571
+ feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3572
+ features_info.append(feature_info.to_row(self.bundle))
3573
+ features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3574
+ internal_features_info.append(feature_info.to_internal_row(self.bundle))
3621
3575
 
3622
3576
  if len(features_info) > 0:
3623
3577
  self.features_info = pd.DataFrame(features_info)
@@ -3642,7 +3596,22 @@ class FeaturesEnricher(TransformerMixin):
3642
3596
  autofe_meta = self._search_task.get_autofe_metadata()
3643
3597
  if autofe_meta is None:
3644
3598
  return None
3645
- features_meta = self._search_task.get_all_features_metadata_v2()
3599
+ if len(self._internal_features_info) != 0:
3600
+
3601
+ def to_feature_meta(row):
3602
+ fm = FeaturesMetadataV2(
3603
+ name=row[bundle.get("features_info_name")],
3604
+ type="",
3605
+ source="",
3606
+ hit_rate=bundle.get("features_info_hitrate"),
3607
+ shap_value=bundle.get("features_info_shap"),
3608
+ data_source=bundle.get("features_info_source"),
3609
+ )
3610
+ return fm
3611
+
3612
+ features_meta = self._internal_features_info.apply(to_feature_meta).to_list()
3613
+ else:
3614
+ features_meta = self._search_task.get_all_features_metadata_v2()
3646
3615
 
3647
3616
  def get_feature_by_name(name: str):
3648
3617
  for m in features_meta:
@@ -3762,11 +3731,17 @@ class FeaturesEnricher(TransformerMixin):
3762
3731
  if len(passed_unsupported_search_keys) > 0:
3763
3732
  raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
3764
3733
 
3734
+ x_columns = [
3735
+ c
3736
+ for c in x.columns
3737
+ if c not in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
3738
+ ]
3739
+
3765
3740
  for column_id, meaning_type in search_keys.items():
3766
3741
  column_name = None
3767
3742
  if isinstance(column_id, str):
3768
3743
  if column_id not in x.columns:
3769
- raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, list(x.columns)))
3744
+ raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, x_columns))
3770
3745
  column_name = column_id
3771
3746
  valid_search_keys[column_name] = meaning_type
3772
3747
  elif isinstance(column_id, int):
upgini/http.py CHANGED
@@ -882,7 +882,7 @@ class _RestClient:
882
882
  if content_type:
883
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
884
884
  if trace_id:
885
- headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
886
886
  for header_key, header_value in additional_headers.items():
887
887
  headers[header_key] = header_value
888
888
  return headers
upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Set, Union
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
228
228
  return c
229
229
  return None
230
230
 
231
- def search_types(self) -> Set[SearchKey]:
232
- search_keys = set()
231
+ def search_types(self) -> Dict[SearchKey, str]:
232
+ search_keys = dict()
233
233
  for keys_group in self.searchKeys:
234
234
  for key in keys_group:
235
235
  column = self.column_by_name(key)
236
236
  if column:
237
- search_keys.add(SearchKey.from_meaning_type(column.meaningType))
237
+ search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
238
238
  return search_keys
239
239
 
240
240
 
upgini/metrics.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import dataclass
3
4
  import inspect
4
5
  import logging
5
6
  import re
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
210
211
  }
211
212
 
212
213
 
214
+ @dataclass
215
+ class _CrossValResults:
216
+ metric: Optional[float]
217
+ metric_std: Optional[float]
218
+ shap_values: Optional[Dict[str, float]]
219
+
220
+ def get_display_metric(self) -> Optional[str]:
221
+ if self.metric is None:
222
+ return None
223
+ elif self.metric_std is None:
224
+ return f"{self.metric:.3f}"
225
+ else:
226
+ return f"{self.metric:.3f} ± {self.metric_std:.3f}"
227
+
228
+
213
229
  class EstimatorWrapper:
214
230
  def __init__(
215
231
  self,
@@ -297,11 +313,11 @@ class EstimatorWrapper:
297
313
 
298
314
  def cross_val_predict(
299
315
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
300
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
316
+ ) -> _CrossValResults:
301
317
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
302
318
 
303
319
  if x.shape[1] == 0:
304
- return None
320
+ return _CrossValResults(metric=None, metric_std=None, shap_values=None)
305
321
 
306
322
  scorer = check_scoring(self.estimator, scoring=self.scorer)
307
323
 
@@ -326,7 +342,7 @@ class EstimatorWrapper:
326
342
 
327
343
  self.check_fold_metrics(metrics_by_fold)
328
344
 
329
- metric = np.mean(metrics_by_fold) * self.multiplier
345
+ metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
330
346
 
331
347
  splits = self.cv.split(x, y, groups)
332
348
 
@@ -351,7 +367,7 @@ class EstimatorWrapper:
351
367
  else:
352
368
  average_shap_values = None
353
369
 
354
- return self.post_process_metric(metric), average_shap_values
370
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
355
371
 
356
372
  def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
357
373
  return shap_values
@@ -367,17 +383,25 @@ class EstimatorWrapper:
367
383
  metric = 2 * metric - 1
368
384
  return metric
369
385
 
370
- def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
386
+ def calculate_metric(
387
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
388
+ ) -> _CrossValResults:
371
389
  x, y, _ = self._prepare_to_calculate(x, y)
372
390
  if baseline_score_column is not None and self.metric_name == "GINI":
373
- metric = roc_auc_score(y, x[baseline_score_column])
391
+ metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
374
392
  else:
375
393
  metrics = []
376
394
  for est in self.cv_estimators:
377
395
  metrics.append(self.scorer(est, x, y))
378
396
 
379
- metric = np.mean(metrics) * self.multiplier
380
- return self.post_process_metric(metric)
397
+ metric, metric_std = self._calculate_metric_from_folds(metrics)
398
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
399
+
400
+ def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
401
+ metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
402
+ metric = np.mean(metrics_by_fold) * self.multiplier
403
+ metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
404
+ return metric, metric_std
381
405
 
382
406
  @staticmethod
383
407
  def create(
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
591
615
 
592
616
  def cross_val_predict(
593
617
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
594
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
618
+ ) -> _CrossValResults:
595
619
  try:
596
620
  return super().cross_val_predict(x, y, baseline_score_column)
597
621
  except Exception as e:
@@ -82,7 +82,7 @@ unregistered_only_personal_keys=Only personal search keys used. Api_key from pro
82
82
  search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
83
83
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
84
84
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
85
- unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearcKey
85
+ unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
86
86
  search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
87
87
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
88
88
  single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
@@ -201,7 +201,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
201
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
202
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
203
203
  phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
- target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
204
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
205
205
  binary_target_reason=only two unique label-values observed
206
206
  non_numeric_multiclass_reason=non-numeric label values observed
207
207
  few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
@@ -212,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
212
212
  all_ok_community_invite=❓ Support request
213
213
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
214
214
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
215
- imbalanced_target=Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
215
+ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
216
216
  loss_selection_info=Using loss `{}` for feature selection
217
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
218
218
 
@@ -109,20 +109,63 @@ class DateTimeSearchKeyConverter:
109
109
 
110
110
  df = self.clean_old_dates(df)
111
111
 
112
+ # Define function to apply sine and cosine transformations
113
+ def add_cyclical_features(df, column, period):
114
+ period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
+ sin_feature = f"datetime_{column}_sin{period_suffix}"
116
+ cos_feature = f"datetime_{column}_cos{period_suffix}"
117
+ if sin_feature not in df.columns:
118
+ df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
119
+ self.generated_features.append(sin_feature)
120
+ if cos_feature not in df.columns:
121
+ df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
+ self.generated_features.append(cos_feature)
123
+
124
+ df["quarter"] = df[self.date_column].dt.quarter
125
+
126
+ # Calculate the start date of the quarter for each timestamp
127
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
128
+
129
+ # Calculate the day in the quarter
130
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
131
+
132
+ # Vectorized calculation of days_in_quarter
133
+ quarter = df["quarter"]
134
+ start = df["quarter_start"]
135
+ year = start.dt.year
136
+ month = start.dt.month
137
+
138
+ quarter_end_year = np.where(quarter == 4, year + 1, year)
139
+ quarter_end_month = np.where(quarter == 4, 1, month + 3)
140
+
141
+ end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
142
+ end.index = df.index
143
+
144
+ df["days_in_quarter"] = (end - start).dt.days
145
+
146
+ add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
147
+
148
+ df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
149
+
112
150
  df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
113
151
 
114
152
  seconds_without_na = df[seconds].dropna()
115
153
  if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
116
154
  self.logger.info("Time found in date search key. Add extra features based on time")
117
- seconds_in_day = 60 * 60 * 24
118
- orders = [1, 2, 24, 48]
119
- for order in orders:
120
- sin_feature = f"datetime_time_sin_{order}"
121
- cos_feature = f"datetime_time_cos_{order}"
122
- df[sin_feature] = np.round(np.sin(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
123
- df[cos_feature] = np.round(np.cos(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
124
- self.generated_features.append(sin_feature)
125
- self.generated_features.append(cos_feature)
155
+
156
+ # Extract basic components
157
+ df["second"] = df[self.date_column].dt.second
158
+ df["minute"] = df[self.date_column].dt.minute
159
+ df["hour"] = df[self.date_column].dt.hour
160
+
161
+ # Apply cyclical transformations
162
+ add_cyclical_features(df, "second", 60) # Seconds in a minute
163
+ add_cyclical_features(df, "minute", 60) # Minutes in an hour
164
+ add_cyclical_features(df, "minute", 30) # Minutes in half an hour
165
+ add_cyclical_features(df, "hour", 24) # Hours in a day
166
+
167
+ # Drop intermediate columns if not needed
168
+ df.drop(columns=["second", "minute", "hour"], inplace=True)
126
169
 
127
170
  df.drop(columns=seconds, inplace=True)
128
171
 
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
38
38
  def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
39
  for email_col in self.email_columns:
40
40
  domain_feature = email_col + self.DOMAIN_SUFFIX
41
- df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
- self.generated_features.append(domain_feature)
41
+ if domain_feature not in df.columns:
42
+ df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
+ self.generated_features.append(domain_feature)
43
44
  return df
44
45
 
45
46
  @staticmethod
@@ -0,0 +1,172 @@
1
+ from dataclasses import dataclass
2
+ import itertools
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from upgini.metadata import FeaturesMetadataV2
9
+ from upgini.resource_bundle import ResourceBundle
10
+
11
+
12
+ LLM_SOURCE = "LLM with external data augmentation"
13
+
14
+
15
@dataclass
class FeatureInfo:
    """Display-ready view of one feature's metadata for the search report.

    Instances are plain value holders built from server-side
    ``FeaturesMetadataV2``; the ``to_*_row`` methods render them as dict
    rows whose column headers come from the localized resource bundle.
    """

    # Display name, possibly wrapped in an HTML anchor to the docs page.
    name: str
    # Raw feature name with no markup.
    internal_name: str
    # SHAP importance rounded for display (see _round_shap_value).
    rounded_shap: float
    hitrate: float
    # Short comma-separated sample of feature values.
    value_preview: str
    # Provider rendered as HTML link(s); plain text kept in internal_provider.
    provider: str
    internal_provider: str
    # Data source rendered as HTML link(s); plain text kept in internal_source.
    source: str
    internal_source: str
    update_frequency: str
    commercial_schema: str
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @staticmethod
    def from_metadata(
        feature_meta: "FeaturesMetadataV2", data: pd.DataFrame, is_client_feature: bool
    ) -> "FeatureInfo":
        """Build a FeatureInfo from server metadata plus a sample dataframe.

        ``is_client_feature`` marks features originating from the client's own
        dataset; those get no provider/source attribution.
        """
        return FeatureInfo(
            name=_get_name(feature_meta),
            internal_name=_get_internal_name(feature_meta),
            rounded_shap=_round_shap_value(feature_meta.shap_value),
            hitrate=feature_meta.hit_rate,
            value_preview=_get_feature_sample(feature_meta, data),
            provider=_get_provider(feature_meta, is_client_feature),
            internal_provider=_get_internal_provider(feature_meta, is_client_feature),
            source=_get_source(feature_meta, is_client_feature),
            internal_source=_get_internal_source(feature_meta, is_client_feature),
            update_frequency=feature_meta.update_frequency,
            commercial_schema=feature_meta.commercial_schema,
            doc_link=feature_meta.doc_link,
            data_provider_link=feature_meta.data_provider_link,
            data_source_link=feature_meta.data_source_link,
        )

    def to_row(self, bundle: "ResourceBundle") -> Dict[str, str]:
        """Public report row: linked values under localized column headers."""
        return {
            bundle.get("features_info_name"): self.name,
            bundle.get("features_info_shap"): self.rounded_shap,
            bundle.get("features_info_hitrate"): self.hitrate,
            bundle.get("features_info_value_preview"): self.value_preview,
            bundle.get("features_info_provider"): self.provider,
            bundle.get("features_info_source"): self.source,
            bundle.get("features_info_update_frequency"): self.update_frequency,
        }

    def to_row_without_links(self, bundle: "ResourceBundle") -> Dict[str, str]:
        """Same columns as :meth:`to_row` but with plain (unlinked) values."""
        return {
            bundle.get("features_info_name"): self.internal_name,
            bundle.get("features_info_shap"): self.rounded_shap,
            bundle.get("features_info_hitrate"): self.hitrate,
            bundle.get("features_info_value_preview"): self.value_preview,
            bundle.get("features_info_provider"): self.internal_provider,
            bundle.get("features_info_source"): self.internal_source,
            bundle.get("features_info_update_frequency"): self.update_frequency,
        }

    def to_internal_row(self, bundle: "ResourceBundle") -> Dict[str, str]:
        """Extended internal row: plain values plus raw links in extra columns."""
        return {
            bundle.get("features_info_name"): self.internal_name,
            "feature_link": self.doc_link,
            bundle.get("features_info_shap"): self.rounded_shap,
            bundle.get("features_info_hitrate"): self.hitrate,
            bundle.get("features_info_value_preview"): self.value_preview,
            bundle.get("features_info_provider"): self.internal_provider,
            "provider_link": self.data_provider_link,
            bundle.get("features_info_source"): self.internal_source,
            "source_link": self.data_source_link,
            # commercial_schema may be None in metadata; render as empty cell.
            bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
            bundle.get("features_info_update_frequency"): self.update_frequency,
        }
87
+
88
+
89
+ def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
90
+ if feature_meta.name in data.columns:
91
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
92
+ if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
93
+ feature_sample = [round(f, 4) for f in feature_sample]
94
+ feature_sample = [str(f) for f in feature_sample]
95
+ feature_sample = ", ".join(feature_sample)
96
+ if len(feature_sample) > 30:
97
+ feature_sample = feature_sample[:30] + "..."
98
+ else:
99
+ feature_sample = ""
100
+ return feature_sample
101
+
102
+
103
def _get_name(feature_meta: "FeaturesMetadataV2") -> str:
    """Display name: anchor-wrapped when a documentation link is available."""
    if not feature_meta.doc_link:
        return feature_meta.name
    return _to_anchor(feature_meta.doc_link, feature_meta.name)
108
+
109
+
110
+ def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
111
+ return feature_meta.name
112
+
113
+
114
def _get_provider(feature_meta: "FeaturesMetadataV2", is_client_feature: bool) -> str:
    """Linked provider name(s); non-client features default to an Upgini link."""
    providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if providers:
        return _make_links(providers, links)
    # No explicit provider: client features show nothing, external ones Upgini.
    return "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
122
+
123
+
124
+ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
125
+ return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
126
+
127
+
128
def _get_source(feature_meta: "FeaturesMetadataV2", is_client_feature: bool) -> str:
    """Linked source name(s); falls back to the plain internal source."""
    sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if not sources:
        return _get_internal_source(feature_meta, is_client_feature)
    return _make_links(sources, links)
136
+
137
+
138
def _get_internal_source(feature_meta: "FeaturesMetadataV2", is_client_feature: bool) -> str:
    """Plain source name; unattributed generated features get the LLM label.

    Search-key-derived columns (``*_country``, ``*_postal_code``) and client
    features are never labeled as LLM-generated.
    """
    if feature_meta.data_source:
        return feature_meta.data_source
    is_key_component = feature_meta.name.endswith(("_country", "_postal_code"))
    if is_client_feature or is_key_component:
        return ""
    return LLM_SOURCE
146
+
147
+
148
+ def _list_or_single(lst: List[str], single: str):
149
+ return lst or ([single] if single else [])
150
+
151
+
152
+ def _to_anchor(link: str, value: str) -> str:
153
+ if not value:
154
+ return ""
155
+ elif not link:
156
+ return value
157
+ elif value == LLM_SOURCE:
158
+ return value
159
+ else:
160
+ return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
+
162
+
163
+ def _make_links(names: List[str], links: List[str]):
164
+ all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
165
+ return ",".join(all_links)
166
+
167
+
168
+ def _round_shap_value(shap: float) -> float:
169
+ if shap > 0.0 and shap < 0.0001:
170
+ return 0.0001
171
+ else:
172
+ return round(shap, 4)
@@ -2,6 +2,7 @@ import logging
2
2
  from logging import Logger
3
3
  from typing import Dict, List, Optional, Tuple
4
4
 
5
+ import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
8
 
@@ -83,10 +84,21 @@ class FeaturesValidator:
83
84
  return [
84
85
  i
85
86
  for i in df
86
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
87
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
87
88
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
88
89
  ]
89
90
 
91
+ @staticmethod
92
+ def __is_integer(series: pd.Series) -> bool:
93
+ return (
94
+ is_integer_dtype(series)
95
+ or series.dropna()
96
+ .apply(
97
+ lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
98
+ )
99
+ .all()
100
+ )
101
+
90
102
  @staticmethod
91
103
  def find_constant_features(df: pd.DataFrame) -> List[str]:
92
104
  return [i for i in df if df[i].nunique() <= 1]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.27
3
+ Version: 1.2.29
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=JKArgvnX6ljUI_WxYnXTejXGdjsA4KJ3Cy2xBcK4vh4,23
1
+ upgini/__about__.py,sha256=kkc0PkaP1QFrTEPI8N5OtZ0p2wkfpteTOMyPLNGAXgk,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=7xYxcLvxQgDX7vE2gWEbBPceAVeEgBVpu9xtBJvXpoQ,194078
7
- upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
6
+ upgini/features_enricher.py,sha256=zcIEKzgiUX46KdtWlMl-15Dz32shXVgscvsQkULusoU,192228
7
+ upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
- upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
- upgini/metrics.py,sha256=PoY1fq6XYAHNzn-rmnwRQZjCoVYP5bJNmKhR0ST2Txk,34588
9
+ upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
10
+ upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=l3yg9H17NwCwvfZQyOYTvXbPP6mwdXH_CGlqyxOQVFY,26669
33
+ upgini/resource_bundle/strings.properties,sha256=fOAeLTsnx8xvJK-7RPFXprATG0n56jeCdse8sQTuVX8,26674
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -43,12 +43,13 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
43
43
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
44
44
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
45
45
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
46
- upgini/utils/datetime_utils.py,sha256=a8X4jX2y3-6E7ZNZIG5z61qfzCvsvaNEjR1Bi5KUqfM,11279
46
+ upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
47
47
  upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
48
48
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
- upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
49
+ upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
50
50
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
- upgini/utils/features_validator.py,sha256=1Xj2ir5LzzYiX3NH8o88c2J6RTTetaTwu0MhjLTyuvM,3378
51
+ upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
52
+ upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
52
53
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
53
54
  upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
54
55
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
@@ -58,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
58
59
  upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
59
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
60
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
61
- upgini-1.2.27.dist-info/METADATA,sha256=iSB1iB7EwBugIUf8DYOz9mEFqGewDAS49-hgYuhtrtU,48578
62
- upgini-1.2.27.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
63
- upgini-1.2.27.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
64
- upgini-1.2.27.dist-info/RECORD,,
62
+ upgini-1.2.29.dist-info/METADATA,sha256=i3crJ_plUCfgF91rGJXv_slwHAANMRUps6kRrNjIIso,48578
63
+ upgini-1.2.29.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.29.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.29.dist-info/RECORD,,