upgini 1.2.28__tar.gz → 1.2.29__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.28 → upgini-1.2.29}/PKG-INFO +1 -1
  2. upgini-1.2.29/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/features_enricher.py +98 -130
  4. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/http.py +1 -1
  5. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/metadata.py +4 -4
  6. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/metrics.py +33 -9
  7. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/strings.properties +1 -1
  8. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/datetime_utils.py +52 -9
  9. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/email_utils.py +3 -2
  10. upgini-1.2.29/src/upgini/utils/feature_info.py +172 -0
  11. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/features_validator.py +13 -1
  12. upgini-1.2.28/src/upgini/__about__.py +0 -1
  13. {upgini-1.2.28 → upgini-1.2.29}/.gitignore +0 -0
  14. {upgini-1.2.28 → upgini-1.2.29}/LICENSE +0 -0
  15. {upgini-1.2.28 → upgini-1.2.29}/README.md +0 -0
  16. {upgini-1.2.28 → upgini-1.2.29}/pyproject.toml +0 -0
  17. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/__init__.py +0 -0
  18. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/ads.py +0 -0
  19. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/ads_management/__init__.py +0 -0
  20. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/ads_management/ads_manager.py +0 -0
  21. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/__init__.py +0 -0
  22. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/all_operands.py +0 -0
  23. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/binary.py +0 -0
  24. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/date.py +0 -0
  25. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/feature.py +0 -0
  26. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/groupby.py +0 -0
  27. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/operand.py +0 -0
  28. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/unary.py +0 -0
  29. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/autofe/vector.py +0 -0
  30. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/data_source/__init__.py +0 -0
  31. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/data_source/data_source_publisher.py +0 -0
  32. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/dataset.py +0 -0
  33. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/errors.py +0 -0
  34. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/lazy_import.py +0 -0
  35. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/normalizer/__init__.py +0 -0
  38. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/normalizer/normalize_utils.py +0 -0
  39. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/__init__.py +0 -0
  40. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/exceptions.py +0 -0
  41. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  42. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/__init__.py +0 -0
  43. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/base.py +0 -0
  44. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/random_under_sampler.py +0 -0
  45. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/sampler/utils.py +0 -0
  46. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/search_task.py +0 -0
  47. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/spinner.py +0 -0
  48. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  49. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/__init__.py +0 -0
  50. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/base_search_key_detector.py +0 -0
  51. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/blocked_time_series.py +0 -0
  52. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/country_utils.py +0 -0
  53. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/custom_loss_utils.py +0 -0
  54. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/cv_utils.py +0 -0
  55. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/deduplicate_utils.py +0 -0
  56. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/display_utils.py +0 -0
  57. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.28 → upgini-1.2.29}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.28
3
+ Version: 1.2.29
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29"
@@ -54,6 +54,7 @@ from upgini.metadata import (
54
54
  SYSTEM_RECORD_ID,
55
55
  TARGET,
56
56
  CVType,
57
+ FeaturesMetadataV2,
57
58
  FileColumnMeaningType,
58
59
  ModelTaskType,
59
60
  RuntimeParameters,
@@ -95,6 +96,7 @@ from upgini.utils.email_utils import (
95
96
  EmailSearchKeyConverter,
96
97
  EmailSearchKeyDetector,
97
98
  )
99
+ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
98
100
  from upgini.utils.features_validator import FeaturesValidator
99
101
  from upgini.utils.format import Format
100
102
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -158,6 +160,10 @@ class FeaturesEnricher(TransformerMixin):
158
160
 
159
161
  shared_datasets: list of str, optional (default=None)
160
162
  List of private shared dataset ids for custom search
163
+
164
+ select_features: bool, optional (default=False)
165
+ If True, return only selected features both from input and data sources.
166
+ Otherwise, return all features from input and only selected features from data sources.
161
167
  """
162
168
 
163
169
  TARGET_NAME = "target"
@@ -224,6 +230,7 @@ class FeaturesEnricher(TransformerMixin):
224
230
  client_visitorid: Optional[str] = None,
225
231
  custom_bundle_config: Optional[str] = None,
226
232
  add_date_if_missing: bool = True,
233
+ select_features: bool = False,
227
234
  **kwargs,
228
235
  ):
229
236
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -277,8 +284,11 @@ class FeaturesEnricher(TransformerMixin):
277
284
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
278
285
  self.metrics: Optional[pd.DataFrame] = None
279
286
  self.feature_names_ = []
287
+ self.dropped_client_feature_names_ = []
280
288
  self.feature_importances_ = []
281
289
  self.search_id = search_id
290
+ self.select_features = select_features
291
+
282
292
  if search_id:
283
293
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
284
294
 
@@ -999,9 +1009,10 @@ class FeaturesEnricher(TransformerMixin):
999
1009
  text_features=self.generate_features,
1000
1010
  has_date=has_date,
1001
1011
  )
1002
- etalon_metric, _ = baseline_estimator.cross_val_predict(
1012
+ etalon_cv_result = baseline_estimator.cross_val_predict(
1003
1013
  fitting_X, y_sorted, self.baseline_score_column
1004
1014
  )
1015
+ etalon_metric = etalon_cv_result.get_display_metric()
1005
1016
  if etalon_metric is None:
1006
1017
  self.logger.info(
1007
1018
  f"Baseline {metric} on train client features is None (maybe all features was removed)"
@@ -1033,9 +1044,9 @@ class FeaturesEnricher(TransformerMixin):
1033
1044
  text_features=self.generate_features,
1034
1045
  has_date=has_date,
1035
1046
  )
1036
- enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1037
- fitting_enriched_X, enriched_y_sorted
1038
- )
1047
+ enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1048
+ enriched_metric = enriched_cv_result.get_display_metric()
1049
+ enriched_shaps = enriched_cv_result.shap_values
1039
1050
 
1040
1051
  if enriched_shaps is not None:
1041
1052
  self._update_shap_values(enriched_shaps)
@@ -1048,7 +1059,7 @@ class FeaturesEnricher(TransformerMixin):
1048
1059
  else:
1049
1060
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1050
1061
  if etalon_metric is not None and enriched_metric is not None:
1051
- uplift = (enriched_metric - etalon_metric) * multiplier
1062
+ uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
1052
1063
 
1053
1064
  train_metrics = {
1054
1065
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1091,9 +1102,10 @@ class FeaturesEnricher(TransformerMixin):
1091
1102
  f"Calculate baseline {metric} on eval set {idx + 1} "
1092
1103
  f"on client features: {eval_X_sorted.columns.to_list()}"
1093
1104
  )
1094
- etalon_eval_metric = baseline_estimator.calculate_metric(
1105
+ etalon_eval_results = baseline_estimator.calculate_metric(
1095
1106
  eval_X_sorted, eval_y_sorted, self.baseline_score_column
1096
1107
  )
1108
+ etalon_eval_metric = etalon_eval_results.get_display_metric()
1097
1109
  self.logger.info(
1098
1110
  f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
1099
1111
  )
@@ -1105,9 +1117,10 @@ class FeaturesEnricher(TransformerMixin):
1105
1117
  f"Calculate enriched {metric} on eval set {idx + 1} "
1106
1118
  f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
1107
1119
  )
1108
- enriched_eval_metric = enriched_estimator.calculate_metric(
1120
+ enriched_eval_results = enriched_estimator.calculate_metric(
1109
1121
  enriched_eval_X_sorted, enriched_eval_y_sorted
1110
1122
  )
1123
+ enriched_eval_metric = enriched_eval_results.get_display_metric()
1111
1124
  self.logger.info(
1112
1125
  f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
1113
1126
  )
@@ -1115,7 +1128,7 @@ class FeaturesEnricher(TransformerMixin):
1115
1128
  enriched_eval_metric = None
1116
1129
 
1117
1130
  if etalon_eval_metric is not None and enriched_eval_metric is not None:
1118
- eval_uplift = (enriched_eval_metric - etalon_eval_metric) * multiplier
1131
+ eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
1119
1132
  else:
1120
1133
  eval_uplift = None
1121
1134
 
@@ -1198,9 +1211,7 @@ class FeaturesEnricher(TransformerMixin):
1198
1211
 
1199
1212
  def _update_shap_values(self, new_shaps: Dict[str, float]):
1200
1213
  new_shaps = {
1201
- feature: self._round_shap_value(shap)
1202
- for feature, shap in new_shaps.items()
1203
- if feature in self.feature_names_
1214
+ feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1204
1215
  }
1205
1216
  features_importances = list(new_shaps.items())
1206
1217
  features_importances.sort(key=lambda m: (-m[1], m[0]))
@@ -1249,7 +1260,7 @@ class FeaturesEnricher(TransformerMixin):
1249
1260
  display_html_dataframe(
1250
1261
  self.relevant_data_sources,
1251
1262
  self._relevant_data_sources_wo_links,
1252
- self.bundle.get("relevant_features_header"),
1263
+ self.bundle.get("relevant_data_sources_header"),
1253
1264
  display_handle=self.data_sources_display_handle,
1254
1265
  )
1255
1266
  except (ImportError, NameError):
@@ -1437,7 +1448,12 @@ class FeaturesEnricher(TransformerMixin):
1437
1448
  client_features = [
1438
1449
  c
1439
1450
  for c in X_sampled.columns.to_list()
1440
- if c
1451
+ if (
1452
+ not self.select_features
1453
+ or c in self.feature_names_
1454
+ or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1455
+ )
1456
+ and c
1441
1457
  not in (
1442
1458
  excluding_search_keys
1443
1459
  + list(self.fit_dropped_features)
@@ -1653,7 +1669,10 @@ class FeaturesEnricher(TransformerMixin):
1653
1669
  generated_features = []
1654
1670
  if date_column is not None:
1655
1671
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1656
- df = converter.convert(df, keep_time=True)
1672
+ # Leave original date column values
1673
+ df_with_date_features = converter.convert(df, keep_time=True)
1674
+ df_with_date_features[date_column] = df[date_column]
1675
+ df = df_with_date_features
1657
1676
  generated_features = converter.generated_features
1658
1677
 
1659
1678
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -1662,9 +1681,10 @@ class FeaturesEnricher(TransformerMixin):
1662
1681
  df = generator.generate(df)
1663
1682
  generated_features.extend(generator.generated_features)
1664
1683
 
1665
- normalizer = Normalizer(self.bundle, self.logger)
1666
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1667
- columns_renaming = normalizer.columns_renaming
1684
+ # normalizer = Normalizer(self.bundle, self.logger)
1685
+ # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1686
+ # columns_renaming = normalizer.columns_renaming
1687
+ columns_renaming = {c: c for c in df.columns}
1668
1688
 
1669
1689
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1670
1690
 
@@ -1980,9 +2000,19 @@ class FeaturesEnricher(TransformerMixin):
1980
2000
  file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
1981
2001
  search_keys = file_metadata.search_types()
1982
2002
  if SearchKey.IPV6_ADDRESS in search_keys:
1983
- search_keys.remove(SearchKey.IPV6_ADDRESS)
2003
+ # search_keys.remove(SearchKey.IPV6_ADDRESS)
2004
+ search_keys.pop(SearchKey.IPV6_ADDRESS, None)
1984
2005
 
1985
- keys = "{" + ", ".join([f'"{key.name}": "{key_example(key)}"' for key in search_keys]) + "}"
2006
+ keys = (
2007
+ "{"
2008
+ + ", ".join(
2009
+ [
2010
+ f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
2011
+ for key, name in search_keys.items()
2012
+ ]
2013
+ )
2014
+ + "}"
2015
+ )
1986
2016
  features_for_transform = self._search_task.get_features_for_transform()
1987
2017
  if features_for_transform:
1988
2018
  original_features_for_transform = [
@@ -2063,7 +2093,9 @@ class FeaturesEnricher(TransformerMixin):
2063
2093
 
2064
2094
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2065
2095
 
2066
- columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
2096
+ columns_to_drop = [
2097
+ c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
2098
+ ]
2067
2099
  if len(columns_to_drop) > 0:
2068
2100
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2069
2101
  self.logger.warning(msg)
@@ -2092,7 +2124,7 @@ class FeaturesEnricher(TransformerMixin):
2092
2124
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2093
2125
  if date_column is not None:
2094
2126
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2095
- df = converter.convert(df)
2127
+ df = converter.convert(df, keep_time=True)
2096
2128
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2097
2129
  generated_features.extend(converter.generated_features)
2098
2130
  else:
@@ -2187,11 +2219,12 @@ class FeaturesEnricher(TransformerMixin):
2187
2219
 
2188
2220
  if add_fit_system_record_id:
2189
2221
  df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2190
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2191
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2192
2222
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2193
2223
  features_not_to_pass.append(SORT_ID)
2194
2224
 
2225
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2226
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2227
+
2195
2228
  # search keys might be changed after explode
2196
2229
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2197
2230
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
@@ -2210,7 +2243,7 @@ class FeaturesEnricher(TransformerMixin):
2210
2243
 
2211
2244
  combined_search_keys = combine_search_keys(search_keys.keys())
2212
2245
 
2213
- df_without_features = df.drop(columns=features_not_to_pass)
2246
+ df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2214
2247
 
2215
2248
  df_without_features, full_duplicates_warning = clean_full_duplicates(
2216
2249
  df_without_features, self.logger, bundle=self.bundle
@@ -2319,11 +2352,15 @@ class FeaturesEnricher(TransformerMixin):
2319
2352
  else:
2320
2353
  result = enrich()
2321
2354
 
2355
+ selecting_columns = [
2356
+ c
2357
+ for c in itertools.chain(validated_X.columns.tolist(), generated_features)
2358
+ if c not in self.dropped_client_feature_names_
2359
+ ]
2322
2360
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2323
- existing_filtered_columns = [
2361
+ selecting_columns.extend(
2324
2362
  c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2325
- ]
2326
- selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2363
+ )
2327
2364
  if add_fit_system_record_id:
2328
2365
  selecting_columns.append(SORT_ID)
2329
2366
 
@@ -3490,15 +3527,7 @@ class FeaturesEnricher(TransformerMixin):
3490
3527
 
3491
3528
  return result_train, result_eval_sets
3492
3529
 
3493
- @staticmethod
3494
- def _round_shap_value(shap: float) -> float:
3495
- if shap > 0.0 and shap < 0.0001:
3496
- return 0.0001
3497
- else:
3498
- return round(shap, 4)
3499
-
3500
3530
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3501
- llm_source = "LLM with external data augmentation"
3502
3531
  if self._search_task is None:
3503
3532
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3504
3533
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3509,116 +3538,40 @@ class FeaturesEnricher(TransformerMixin):
3509
3538
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3510
3539
 
3511
3540
  self.feature_names_ = []
3541
+ self.dropped_client_feature_names_ = []
3512
3542
  self.feature_importances_ = []
3513
3543
  features_info = []
3514
3544
  features_info_without_links = []
3515
3545
  internal_features_info = []
3516
3546
 
3517
- def list_or_single(lst: List[str], single: str):
3518
- return lst or ([single] if single else [])
3519
-
3520
- def to_anchor(link: str, value: str) -> str:
3521
- if not value:
3522
- return ""
3523
- elif not link:
3524
- return value
3525
- elif value == llm_source:
3526
- return value
3527
- else:
3528
- return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
3529
-
3530
- def make_links(names: List[str], links: List[str]):
3531
- all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
3532
- return ",".join(all_links)
3533
-
3534
3547
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3535
3548
  for feature_meta in features_meta:
3536
3549
  if feature_meta.name in original_names_dict.keys():
3537
3550
  feature_meta.name = original_names_dict[feature_meta.name]
3538
- # Use only enriched features
3551
+
3552
+ is_client_feature = feature_meta.name in x_columns
3553
+
3554
+ if feature_meta.shap_value == 0.0:
3555
+ if self.select_features:
3556
+ self.dropped_client_feature_names_.append(feature_meta.name)
3557
+ continue
3558
+
3559
+ # Use only important features
3539
3560
  if (
3540
- feature_meta.name in x_columns
3561
+ feature_meta.name in self.fit_generated_features
3541
3562
  or feature_meta.name == COUNTRY
3542
- or feature_meta.shap_value == 0.0
3543
- or feature_meta.name in self.fit_generated_features
3563
+ # In select_features mode we select also from etalon features and need to show them
3564
+ or (not self.select_features and is_client_feature)
3544
3565
  ):
3545
3566
  continue
3546
3567
 
3547
- feature_sample = []
3548
3568
  self.feature_names_.append(feature_meta.name)
3549
- self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3550
- if feature_meta.name in features_df.columns:
3551
- feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3552
- if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
3553
- feature_sample = [round(f, 4) for f in feature_sample]
3554
- feature_sample = [str(f) for f in feature_sample]
3555
- feature_sample = ", ".join(feature_sample)
3556
- if len(feature_sample) > 30:
3557
- feature_sample = feature_sample[:30] + "..."
3558
-
3559
- internal_provider = feature_meta.data_provider or "Upgini"
3560
- providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
3561
- provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
3562
- if providers:
3563
- provider = make_links(providers, provider_links)
3564
- else:
3565
- provider = to_anchor("https://upgini.com", "Upgini")
3569
+ self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3566
3570
 
3567
- internal_source = feature_meta.data_source or (
3568
- llm_source
3569
- if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
3570
- else ""
3571
- )
3572
- sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
3573
- source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
3574
- if sources:
3575
- source = make_links(sources, source_links)
3576
- else:
3577
- source = internal_source
3578
-
3579
- internal_feature_name = feature_meta.name
3580
- if feature_meta.doc_link:
3581
- feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
3582
- else:
3583
- feature_name = internal_feature_name
3584
-
3585
- features_info.append(
3586
- {
3587
- self.bundle.get("features_info_name"): feature_name,
3588
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3589
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3590
- self.bundle.get("features_info_value_preview"): feature_sample,
3591
- self.bundle.get("features_info_provider"): provider,
3592
- self.bundle.get("features_info_source"): source,
3593
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3594
- }
3595
- )
3596
- features_info_without_links.append(
3597
- {
3598
- self.bundle.get("features_info_name"): internal_feature_name,
3599
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3600
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3601
- self.bundle.get("features_info_value_preview"): feature_sample,
3602
- self.bundle.get("features_info_provider"): internal_provider,
3603
- self.bundle.get("features_info_source"): internal_source,
3604
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3605
- }
3606
- )
3607
- internal_features_info.append(
3608
- {
3609
- self.bundle.get("features_info_name"): internal_feature_name,
3610
- "feature_link": feature_meta.doc_link,
3611
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3612
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3613
- self.bundle.get("features_info_value_preview"): feature_sample,
3614
- self.bundle.get("features_info_provider"): internal_provider,
3615
- "provider_link": feature_meta.data_provider_link,
3616
- self.bundle.get("features_info_source"): internal_source,
3617
- "source_link": feature_meta.data_source_link,
3618
- self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3619
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3620
- }
3621
- )
3571
+ feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3572
+ features_info.append(feature_info.to_row(self.bundle))
3573
+ features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3574
+ internal_features_info.append(feature_info.to_internal_row(self.bundle))
3622
3575
 
3623
3576
  if len(features_info) > 0:
3624
3577
  self.features_info = pd.DataFrame(features_info)
@@ -3643,7 +3596,22 @@ class FeaturesEnricher(TransformerMixin):
3643
3596
  autofe_meta = self._search_task.get_autofe_metadata()
3644
3597
  if autofe_meta is None:
3645
3598
  return None
3646
- features_meta = self._search_task.get_all_features_metadata_v2()
3599
+ if len(self._internal_features_info) != 0:
3600
+
3601
+ def to_feature_meta(row):
3602
+ fm = FeaturesMetadataV2(
3603
+ name=row[bundle.get("features_info_name")],
3604
+ type="",
3605
+ source="",
3606
+ hit_rate=bundle.get("features_info_hitrate"),
3607
+ shap_value=bundle.get("features_info_shap"),
3608
+ data_source=bundle.get("features_info_source"),
3609
+ )
3610
+ return fm
3611
+
3612
+ features_meta = self._internal_features_info.apply(to_feature_meta).to_list()
3613
+ else:
3614
+ features_meta = self._search_task.get_all_features_metadata_v2()
3647
3615
 
3648
3616
  def get_feature_by_name(name: str):
3649
3617
  for m in features_meta:
@@ -882,7 +882,7 @@ class _RestClient:
882
882
  if content_type:
883
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
884
884
  if trace_id:
885
- headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
886
886
  for header_key, header_value in additional_headers.items():
887
887
  headers[header_key] = header_value
888
888
  return headers
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Set, Union
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
228
228
  return c
229
229
  return None
230
230
 
231
- def search_types(self) -> Set[SearchKey]:
232
- search_keys = set()
231
+ def search_types(self) -> Dict[SearchKey, str]:
232
+ search_keys = dict()
233
233
  for keys_group in self.searchKeys:
234
234
  for key in keys_group:
235
235
  column = self.column_by_name(key)
236
236
  if column:
237
- search_keys.add(SearchKey.from_meaning_type(column.meaningType))
237
+ search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
238
238
  return search_keys
239
239
 
240
240
 
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import dataclass
3
4
  import inspect
4
5
  import logging
5
6
  import re
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
210
211
  }
211
212
 
212
213
 
214
+ @dataclass
215
+ class _CrossValResults:
216
+ metric: Optional[float]
217
+ metric_std: Optional[float]
218
+ shap_values: Optional[Dict[str, float]]
219
+
220
+ def get_display_metric(self) -> Optional[str]:
221
+ if self.metric is None:
222
+ return None
223
+ elif self.metric_std is None:
224
+ return f"{self.metric:.3f}"
225
+ else:
226
+ return f"{self.metric:.3f} ± {self.metric_std:.3f}"
227
+
228
+
213
229
  class EstimatorWrapper:
214
230
  def __init__(
215
231
  self,
@@ -297,11 +313,11 @@ class EstimatorWrapper:
297
313
 
298
314
  def cross_val_predict(
299
315
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
300
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
316
+ ) -> _CrossValResults:
301
317
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
302
318
 
303
319
  if x.shape[1] == 0:
304
- return None
320
+ return _CrossValResults(metric=None, metric_std=None, shap_values=None)
305
321
 
306
322
  scorer = check_scoring(self.estimator, scoring=self.scorer)
307
323
 
@@ -326,7 +342,7 @@ class EstimatorWrapper:
326
342
 
327
343
  self.check_fold_metrics(metrics_by_fold)
328
344
 
329
- metric = np.mean(metrics_by_fold) * self.multiplier
345
+ metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
330
346
 
331
347
  splits = self.cv.split(x, y, groups)
332
348
 
@@ -351,7 +367,7 @@ class EstimatorWrapper:
351
367
  else:
352
368
  average_shap_values = None
353
369
 
354
- return self.post_process_metric(metric), average_shap_values
370
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
355
371
 
356
372
  def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
357
373
  return shap_values
@@ -367,17 +383,25 @@ class EstimatorWrapper:
367
383
  metric = 2 * metric - 1
368
384
  return metric
369
385
 
370
- def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
386
+ def calculate_metric(
387
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
388
+ ) -> _CrossValResults:
371
389
  x, y, _ = self._prepare_to_calculate(x, y)
372
390
  if baseline_score_column is not None and self.metric_name == "GINI":
373
- metric = roc_auc_score(y, x[baseline_score_column])
391
+ metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
374
392
  else:
375
393
  metrics = []
376
394
  for est in self.cv_estimators:
377
395
  metrics.append(self.scorer(est, x, y))
378
396
 
379
- metric = np.mean(metrics) * self.multiplier
380
- return self.post_process_metric(metric)
397
+ metric, metric_std = self._calculate_metric_from_folds(metrics)
398
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
399
+
400
+ def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
401
+ metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
402
+ metric = np.mean(metrics_by_fold) * self.multiplier
403
+ metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
404
+ return metric, metric_std
381
405
 
382
406
  @staticmethod
383
407
  def create(
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
591
615
 
592
616
  def cross_val_predict(
593
617
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
594
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
618
+ ) -> _CrossValResults:
595
619
  try:
596
620
  return super().cross_val_predict(x, y, baseline_score_column)
597
621
  except Exception as e:
@@ -82,7 +82,7 @@ unregistered_only_personal_keys=Only personal search keys used. Api_key from pro
82
82
  search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
83
83
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
84
84
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
85
- unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearcKey
85
+ unsupported_type_of_search_key=Unsupported type of search key: {}. It should be a member of SearchKey
86
86
  search_key_country_and_country_code=SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
87
87
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
88
88
  single_constant_search_key=Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
@@ -109,20 +109,63 @@ class DateTimeSearchKeyConverter:
109
109
 
110
110
  df = self.clean_old_dates(df)
111
111
 
112
+ # Define function to apply sine and cosine transformations
113
def add_cyclical_features(df, column, period):
    """Encode ``df[column]`` as a sine/cosine pair over ``period``.

    The new columns are named ``datetime_<column>_sin<suffix>`` and
    ``datetime_<column>_cos<suffix>``. The suffix is ``_<period>`` for every
    column except ``"day_in_quarter"``, which gets no suffix.

    A target column that already exists in ``df`` is left untouched and is
    not registered; each column created here is appended to
    ``self.generated_features`` (``self`` comes from the enclosing method).
    """
    suffix = "" if column == "day_in_quarter" else f"_{period}"
    for func_name, func in (("sin", np.sin), ("cos", np.cos)):
        feature = f"datetime_{column}_{func_name}{suffix}"
        if feature in df.columns:
            continue
        df[feature] = func(2 * np.pi * df[column] / period)
        self.generated_features.append(feature)
123
+
124
+ df["quarter"] = df[self.date_column].dt.quarter
125
+
126
+ # Calculate the start date of the quarter for each timestamp
127
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
128
+
129
+ # Calculate the day in the quarter
130
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
131
+
132
+ # Vectorized calculation of days_in_quarter
133
+ quarter = df["quarter"]
134
+ start = df["quarter_start"]
135
+ year = start.dt.year
136
+ month = start.dt.month
137
+
138
+ quarter_end_year = np.where(quarter == 4, year + 1, year)
139
+ quarter_end_month = np.where(quarter == 4, 1, month + 3)
140
+
141
+ end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
142
+ end.index = df.index
143
+
144
+ df["days_in_quarter"] = (end - start).dt.days
145
+
146
+ add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
147
+
148
+ df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
149
+
112
150
  df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
113
151
 
114
152
  seconds_without_na = df[seconds].dropna()
115
153
  if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
116
154
  self.logger.info("Time found in date search key. Add extra features based on time")
117
- seconds_in_day = 60 * 60 * 24
118
- orders = [1, 2, 24, 48]
119
- for order in orders:
120
- sin_feature = f"datetime_time_sin_{order}"
121
- cos_feature = f"datetime_time_cos_{order}"
122
- df[sin_feature] = np.round(np.sin(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
123
- df[cos_feature] = np.round(np.cos(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
124
- self.generated_features.append(sin_feature)
125
- self.generated_features.append(cos_feature)
155
+
156
+ # Extract basic components
157
+ df["second"] = df[self.date_column].dt.second
158
+ df["minute"] = df[self.date_column].dt.minute
159
+ df["hour"] = df[self.date_column].dt.hour
160
+
161
+ # Apply cyclical transformations
162
+ add_cyclical_features(df, "second", 60) # Seconds in a minute
163
+ add_cyclical_features(df, "minute", 60) # Minutes in an hour
164
+ add_cyclical_features(df, "minute", 30) # Minutes in half an hour
165
+ add_cyclical_features(df, "hour", 24) # Hours in a day
166
+
167
+ # Drop intermediate columns if not needed
168
+ df.drop(columns=["second", "minute", "hour"], inplace=True)
126
169
 
127
170
  df.drop(columns=seconds, inplace=True)
128
171
 
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
38
38
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add a domain column for every configured e-mail column.

    For each name in ``self.email_columns`` the target column is that name
    plus ``self.DOMAIN_SUFFIX``. When the target is missing from ``df`` it
    is filled by applying ``self._email_to_domain`` to the e-mail column,
    cast to the pandas ``"string"`` dtype, and recorded in
    ``self.generated_features``. An already existing target column is left
    as is and is not recorded.

    Returns:
        The same ``df`` object, modified in place.
    """
    for email_col in self.email_columns:
        domain_feature = email_col + self.DOMAIN_SUFFIX
        if domain_feature in df.columns:
            # Already present: neither overwrite it nor register it again.
            continue
        domains = df[email_col].apply(self._email_to_domain)
        df[domain_feature] = domains.astype("string")
        self.generated_features.append(domain_feature)
    return df
44
45
 
45
46
  @staticmethod
@@ -0,0 +1,172 @@
1
+ from dataclasses import dataclass
2
+ import itertools
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from upgini.metadata import FeaturesMetadataV2
9
+ from upgini.resource_bundle import ResourceBundle
10
+
11
+
12
+ LLM_SOURCE = "LLM with external data augmentation"
13
+
14
+
15
@dataclass
class FeatureInfo:
    """Presentation-ready description of a single feature.

    ``name``, ``provider`` and ``source`` are the display variants produced
    by ``_get_name`` / ``_get_provider`` / ``_get_source``; the ``internal_*``
    fields are the counterparts produced by the ``_get_internal_*`` helpers.
    The remaining fields are copied from the feature metadata, except
    ``rounded_shap`` (via ``_round_shap_value``) and ``value_preview``
    (via ``_get_feature_sample``).
    """

    name: str
    internal_name: str
    rounded_shap: float
    hitrate: float
    value_preview: str
    provider: str
    internal_provider: str
    source: str
    internal_source: str
    update_frequency: str
    commercial_schema: str
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @staticmethod
    def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
        """Build a ``FeatureInfo`` from feature metadata and a data sample.

        ``data`` is only used to draw the value preview; ``is_client_feature``
        is forwarded to the provider/source helpers.
        """
        fields = dict(
            name=_get_name(feature_meta),
            internal_name=_get_internal_name(feature_meta),
            rounded_shap=_round_shap_value(feature_meta.shap_value),
            hitrate=feature_meta.hit_rate,
            value_preview=_get_feature_sample(feature_meta, data),
            provider=_get_provider(feature_meta, is_client_feature),
            internal_provider=_get_internal_provider(feature_meta, is_client_feature),
            source=_get_source(feature_meta, is_client_feature),
            internal_source=_get_internal_source(feature_meta, is_client_feature),
            update_frequency=feature_meta.update_frequency,
            commercial_schema=feature_meta.commercial_schema,
            doc_link=feature_meta.doc_link,
            data_provider_link=feature_meta.data_provider_link,
            data_source_link=feature_meta.data_source_link,
        )
        return FeatureInfo(**fields)

    def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the display row; column titles are looked up in ``bundle``."""
        columns = (
            ("features_info_name", self.name),
            ("features_info_shap", self.rounded_shap),
            ("features_info_hitrate", self.hitrate),
            ("features_info_value_preview", self.value_preview),
            ("features_info_provider", self.provider),
            ("features_info_source", self.source),
            ("features_info_update_frequency", self.update_frequency),
        )
        return {bundle.get(key): value for key, value in columns}

    def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the same columns as ``to_row`` using the internal name, provider and source."""
        columns = (
            ("features_info_name", self.internal_name),
            ("features_info_shap", self.rounded_shap),
            ("features_info_hitrate", self.hitrate),
            ("features_info_value_preview", self.value_preview),
            ("features_info_provider", self.internal_provider),
            ("features_info_source", self.internal_source),
            ("features_info_update_frequency", self.update_frequency),
        )
        return {bundle.get(key): value for key, value in columns}

    def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
        """Return the internal row: internal values plus separate link columns.

        The link columns use the fixed keys ``feature_link``, ``provider_link``
        and ``source_link``; a missing commercial schema becomes ``""``.
        """
        row = {}
        row[bundle.get("features_info_name")] = self.internal_name
        row["feature_link"] = self.doc_link
        row[bundle.get("features_info_shap")] = self.rounded_shap
        row[bundle.get("features_info_hitrate")] = self.hitrate
        row[bundle.get("features_info_value_preview")] = self.value_preview
        row[bundle.get("features_info_provider")] = self.internal_provider
        row["provider_link"] = self.data_provider_link
        row[bundle.get("features_info_source")] = self.internal_source
        row["source_link"] = self.data_source_link
        row[bundle.get("features_info_commercial_schema")] = self.commercial_schema or ""
        row[bundle.get("features_info_update_frequency")] = self.update_frequency
        return row
87
+
88
+
89
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
    """Build a short, human-readable preview of a feature's values.

    Three values are drawn at random (with replacement, so repeats are
    possible) from the distinct non-missing values of the column named
    ``feature_meta.name``. Floats are rounded to 4 decimals, the values are
    joined with ``", "`` and the text is cut to 30 characters plus ``"..."``
    when longer.

    Returns:
        The preview text, or ``""`` when the column is absent from ``data``
        or holds no non-missing values.
    """
    if feature_meta.name not in data.columns:
        return ""

    unique_values = data[feature_meta.name].dropna().unique()
    # np.random.choice raises ValueError on an empty population, which is the
    # case for a column with only missing values or a frame without rows.
    if len(unique_values) == 0:
        return ""

    feature_sample = np.random.choice(unique_values, 3).tolist()
    if isinstance(feature_sample[0], float):
        feature_sample = [round(f, 4) for f in feature_sample]
    preview = ", ".join(str(f) for f in feature_sample)
    if len(preview) > 30:
        preview = preview[:30] + "..."
    return preview
101
+
102
+
103
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
    """Return the display name: an anchor to the doc link when one is set, else the plain name."""
    if not feature_meta.doc_link:
        return feature_meta.name
    return _to_anchor(feature_meta.doc_link, feature_meta.name)
108
+
109
+
110
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
    """Return the feature's raw name, with no markup applied."""
    internal_name = feature_meta.name
    return internal_name
112
+
113
+
114
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the provider cell for display.

    Providers (the list form, else the single value) are rendered through
    ``_make_links`` together with their links. With no provider at all the
    result is ``""`` for a client feature and an Upgini anchor otherwise.
    """
    names = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if names:
        return _make_links(names, links)
    if is_client_feature:
        return ""
    return _to_anchor("https://upgini.com", "Upgini")
122
+
123
+
124
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the plain provider name: ``""`` for client features, else the data provider or ``"Upgini"``."""
    if is_client_feature:
        return ""
    return feature_meta.data_provider or "Upgini"
126
+
127
+
128
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the source cell for display.

    Sources (the list form, else the single value) are rendered through
    ``_make_links`` together with their links; with no source at all the
    result falls back to ``_get_internal_source``.
    """
    names = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if names:
        return _make_links(names, links)
    return _get_internal_source(feature_meta, is_client_feature)
136
+
137
+
138
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the plain source name.

    An explicit ``data_source`` wins. Otherwise the result is ``""`` for
    client features and for features whose name ends with ``_country`` or
    ``_postal_code``, and ``LLM_SOURCE`` for everything else.
    """
    explicit_source = feature_meta.data_source
    if explicit_source:
        return explicit_source
    if feature_meta.name.endswith(("_country", "_postal_code")) or is_client_feature:
        return ""
    return LLM_SOURCE
146
+
147
+
148
def _list_or_single(lst: List[str], single: str):
    """Return ``lst`` when non-empty, else ``[single]`` when ``single`` is truthy, else ``[]``."""
    if lst:
        return lst
    if single:
        return [single]
    return []
150
+
151
+
152
def _to_anchor(link: str, value: str) -> str:
    """Wrap ``value`` in an HTML anchor pointing at ``link``.

    Returns ``""`` for an empty value. The value is returned as plain text
    when there is no link or when the value is ``LLM_SOURCE``.
    """
    if not value:
        return ""
    # The link check comes first so LLM_SOURCE is only compared when a link exists.
    if not link or value == LLM_SOURCE:
        return value
    return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
+
162
+
163
def _make_links(names: List[str], links: List[str]):
    """Pair names with links position by position and join the rendered entries with ``","``.

    The shorter list is padded with ``None``, so a name without a link and a
    link without a name are both passed to ``_to_anchor`` as they are.
    """
    anchors = []
    for name, link in itertools.zip_longest(names, links):
        anchors.append(_to_anchor(link, name))
    return ",".join(anchors)
166
+
167
+
168
def _round_shap_value(shap: float) -> float:
    """Round a SHAP value to 4 decimals, without letting a small positive value collapse to zero.

    Values strictly between 0 and 0.0001 are reported as 0.0001.
    """
    if 0.0 < shap < 0.0001:
        return 0.0001
    return round(shap, 4)
@@ -2,6 +2,7 @@ import logging
2
2
  from logging import Logger
3
3
  from typing import Dict, List, Optional, Tuple
4
4
 
5
+ import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
8
 
@@ -83,10 +84,21 @@ class FeaturesValidator:
83
84
  return [
84
85
  i
85
86
  for i in df
86
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
87
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
87
88
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
88
89
  ]
89
90
 
91
@staticmethod
def __is_integer(series: pd.Series) -> bool:
    """Tell whether a series holds integers, either by dtype or by value.

    A series qualifies when its dtype is an integer dtype, or when every
    non-missing element is a ``float`` with an integral value whose
    magnitude is below the int64 maximum.
    """
    if is_integer_dtype(series):
        return True

    int64_max = np.iinfo(np.int64).max

    def is_integral_float(f) -> bool:
        return isinstance(f, float) and float.is_integer(f) and abs(f) < int64_max

    return series.dropna().apply(is_integral_float).all()
101
+
90
102
@staticmethod
def find_constant_features(df: pd.DataFrame) -> List[str]:
    """Return the columns of ``df`` that hold at most one distinct non-missing value."""
    constant_columns = []
    for column in df:
        if df[column].nunique() <= 1:
            constant_columns.append(column)
    return constant_columns
@@ -1 +0,0 @@
1
- __version__ = "1.2.28"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes