upgini 1.2.29__tar.gz → 1.2.29a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
Files changed (67)
  1. {upgini-1.2.29 → upgini-1.2.29a2}/PKG-INFO +1 -1
  2. upgini-1.2.29a2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/features_enricher.py +130 -98
  4. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/http.py +1 -1
  5. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/metadata.py +4 -4
  6. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/metrics.py +9 -33
  7. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/datetime_utils.py +7 -10
  8. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/email_utils.py +2 -3
  9. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/features_validator.py +1 -13
  10. upgini-1.2.29/src/upgini/__about__.py +0 -1
  11. upgini-1.2.29/src/upgini/utils/feature_info.py +0 -172
  12. {upgini-1.2.29 → upgini-1.2.29a2}/.gitignore +0 -0
  13. {upgini-1.2.29 → upgini-1.2.29a2}/LICENSE +0 -0
  14. {upgini-1.2.29 → upgini-1.2.29a2}/README.md +0 -0
  15. {upgini-1.2.29 → upgini-1.2.29a2}/pyproject.toml +0 -0
  16. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/__init__.py +0 -0
  17. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/ads.py +0 -0
  18. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/ads_management/__init__.py +0 -0
  19. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/ads_management/ads_manager.py +0 -0
  20. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/__init__.py +0 -0
  21. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/all_operands.py +0 -0
  22. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/binary.py +0 -0
  23. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/date.py +0 -0
  24. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/feature.py +0 -0
  25. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/groupby.py +0 -0
  26. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/operand.py +0 -0
  27. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/unary.py +0 -0
  28. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/autofe/vector.py +0 -0
  29. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/data_source/__init__.py +0 -0
  30. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  31. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/dataset.py +0 -0
  32. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/errors.py +0 -0
  33. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/lazy_import.py +0 -0
  34. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/mdc/__init__.py +0 -0
  35. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/mdc/context.py +0 -0
  36. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/normalizer/__init__.py +0 -0
  37. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/normalizer/normalize_utils.py +0 -0
  38. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/resource_bundle/__init__.py +0 -0
  39. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  40. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/resource_bundle/strings.properties +0 -0
  41. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  42. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/sampler/__init__.py +0 -0
  43. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/sampler/base.py +0 -0
  44. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/sampler/random_under_sampler.py +0 -0
  45. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/sampler/utils.py +0 -0
  46. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/search_task.py +0 -0
  47. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/spinner.py +0 -0
  48. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  49. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/__init__.py +0 -0
  50. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  51. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/blocked_time_series.py +0 -0
  52. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/country_utils.py +0 -0
  53. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  54. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/cv_utils.py +0 -0
  55. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/deduplicate_utils.py +0 -0
  56. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/display_utils.py +0 -0
  57. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.29 → upgini-1.2.29a2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29
3
+ Version: 1.2.29a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29a2"
@@ -54,7 +54,6 @@ from upgini.metadata import (
54
54
  SYSTEM_RECORD_ID,
55
55
  TARGET,
56
56
  CVType,
57
- FeaturesMetadataV2,
58
57
  FileColumnMeaningType,
59
58
  ModelTaskType,
60
59
  RuntimeParameters,
@@ -96,7 +95,6 @@ from upgini.utils.email_utils import (
96
95
  EmailSearchKeyConverter,
97
96
  EmailSearchKeyDetector,
98
97
  )
99
- from upgini.utils.feature_info import FeatureInfo, _round_shap_value
100
98
  from upgini.utils.features_validator import FeaturesValidator
101
99
  from upgini.utils.format import Format
102
100
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -160,10 +158,6 @@ class FeaturesEnricher(TransformerMixin):
160
158
 
161
159
  shared_datasets: list of str, optional (default=None)
162
160
  List of private shared dataset ids for custom search
163
-
164
- select_features: bool, optional (default=False)
165
- If True, return only selected features both from input and data sources.
166
- Otherwise, return all features from input and only selected features from data sources.
167
161
  """
168
162
 
169
163
  TARGET_NAME = "target"
@@ -230,7 +224,6 @@ class FeaturesEnricher(TransformerMixin):
230
224
  client_visitorid: Optional[str] = None,
231
225
  custom_bundle_config: Optional[str] = None,
232
226
  add_date_if_missing: bool = True,
233
- select_features: bool = False,
234
227
  **kwargs,
235
228
  ):
236
229
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -284,11 +277,8 @@ class FeaturesEnricher(TransformerMixin):
284
277
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
285
278
  self.metrics: Optional[pd.DataFrame] = None
286
279
  self.feature_names_ = []
287
- self.dropped_client_feature_names_ = []
288
280
  self.feature_importances_ = []
289
281
  self.search_id = search_id
290
- self.select_features = select_features
291
-
292
282
  if search_id:
293
283
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
294
284
 
@@ -1009,10 +999,9 @@ class FeaturesEnricher(TransformerMixin):
1009
999
  text_features=self.generate_features,
1010
1000
  has_date=has_date,
1011
1001
  )
1012
- etalon_cv_result = baseline_estimator.cross_val_predict(
1002
+ etalon_metric, _ = baseline_estimator.cross_val_predict(
1013
1003
  fitting_X, y_sorted, self.baseline_score_column
1014
1004
  )
1015
- etalon_metric = etalon_cv_result.get_display_metric()
1016
1005
  if etalon_metric is None:
1017
1006
  self.logger.info(
1018
1007
  f"Baseline {metric} on train client features is None (maybe all features was removed)"
@@ -1044,9 +1033,9 @@ class FeaturesEnricher(TransformerMixin):
1044
1033
  text_features=self.generate_features,
1045
1034
  has_date=has_date,
1046
1035
  )
1047
- enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1048
- enriched_metric = enriched_cv_result.get_display_metric()
1049
- enriched_shaps = enriched_cv_result.shap_values
1036
+ enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1037
+ fitting_enriched_X, enriched_y_sorted
1038
+ )
1050
1039
 
1051
1040
  if enriched_shaps is not None:
1052
1041
  self._update_shap_values(enriched_shaps)
@@ -1059,7 +1048,7 @@ class FeaturesEnricher(TransformerMixin):
1059
1048
  else:
1060
1049
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1061
1050
  if etalon_metric is not None and enriched_metric is not None:
1062
- uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
1051
+ uplift = (enriched_metric - etalon_metric) * multiplier
1063
1052
 
1064
1053
  train_metrics = {
1065
1054
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1102,10 +1091,9 @@ class FeaturesEnricher(TransformerMixin):
1102
1091
  f"Calculate baseline {metric} on eval set {idx + 1} "
1103
1092
  f"on client features: {eval_X_sorted.columns.to_list()}"
1104
1093
  )
1105
- etalon_eval_results = baseline_estimator.calculate_metric(
1094
+ etalon_eval_metric = baseline_estimator.calculate_metric(
1106
1095
  eval_X_sorted, eval_y_sorted, self.baseline_score_column
1107
1096
  )
1108
- etalon_eval_metric = etalon_eval_results.get_display_metric()
1109
1097
  self.logger.info(
1110
1098
  f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
1111
1099
  )
@@ -1117,10 +1105,9 @@ class FeaturesEnricher(TransformerMixin):
1117
1105
  f"Calculate enriched {metric} on eval set {idx + 1} "
1118
1106
  f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
1119
1107
  )
1120
- enriched_eval_results = enriched_estimator.calculate_metric(
1108
+ enriched_eval_metric = enriched_estimator.calculate_metric(
1121
1109
  enriched_eval_X_sorted, enriched_eval_y_sorted
1122
1110
  )
1123
- enriched_eval_metric = enriched_eval_results.get_display_metric()
1124
1111
  self.logger.info(
1125
1112
  f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
1126
1113
  )
@@ -1128,7 +1115,7 @@ class FeaturesEnricher(TransformerMixin):
1128
1115
  enriched_eval_metric = None
1129
1116
 
1130
1117
  if etalon_eval_metric is not None and enriched_eval_metric is not None:
1131
- eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
1118
+ eval_uplift = (enriched_eval_metric - etalon_eval_metric) * multiplier
1132
1119
  else:
1133
1120
  eval_uplift = None
1134
1121
 
@@ -1211,7 +1198,9 @@ class FeaturesEnricher(TransformerMixin):
1211
1198
 
1212
1199
  def _update_shap_values(self, new_shaps: Dict[str, float]):
1213
1200
  new_shaps = {
1214
- feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1201
+ feature: self._round_shap_value(shap)
1202
+ for feature, shap in new_shaps.items()
1203
+ if feature in self.feature_names_
1215
1204
  }
1216
1205
  features_importances = list(new_shaps.items())
1217
1206
  features_importances.sort(key=lambda m: (-m[1], m[0]))
@@ -1260,7 +1249,7 @@ class FeaturesEnricher(TransformerMixin):
1260
1249
  display_html_dataframe(
1261
1250
  self.relevant_data_sources,
1262
1251
  self._relevant_data_sources_wo_links,
1263
- self.bundle.get("relevant_data_sources_header"),
1252
+ self.bundle.get("relevant_features_header"),
1264
1253
  display_handle=self.data_sources_display_handle,
1265
1254
  )
1266
1255
  except (ImportError, NameError):
@@ -1448,12 +1437,7 @@ class FeaturesEnricher(TransformerMixin):
1448
1437
  client_features = [
1449
1438
  c
1450
1439
  for c in X_sampled.columns.to_list()
1451
- if (
1452
- not self.select_features
1453
- or c in self.feature_names_
1454
- or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1455
- )
1456
- and c
1440
+ if c
1457
1441
  not in (
1458
1442
  excluding_search_keys
1459
1443
  + list(self.fit_dropped_features)
@@ -1669,10 +1653,7 @@ class FeaturesEnricher(TransformerMixin):
1669
1653
  generated_features = []
1670
1654
  if date_column is not None:
1671
1655
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1672
- # Leave original date column values
1673
- df_with_date_features = converter.convert(df, keep_time=True)
1674
- df_with_date_features[date_column] = df[date_column]
1675
- df = df_with_date_features
1656
+ df = converter.convert(df, keep_time=True)
1676
1657
  generated_features = converter.generated_features
1677
1658
 
1678
1659
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -1681,10 +1662,9 @@ class FeaturesEnricher(TransformerMixin):
1681
1662
  df = generator.generate(df)
1682
1663
  generated_features.extend(generator.generated_features)
1683
1664
 
1684
- # normalizer = Normalizer(self.bundle, self.logger)
1685
- # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1686
- # columns_renaming = normalizer.columns_renaming
1687
- columns_renaming = {c: c for c in df.columns}
1665
+ normalizer = Normalizer(self.bundle, self.logger)
1666
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1667
+ columns_renaming = normalizer.columns_renaming
1688
1668
 
1689
1669
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1690
1670
 
@@ -2000,19 +1980,9 @@ class FeaturesEnricher(TransformerMixin):
2000
1980
  file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
2001
1981
  search_keys = file_metadata.search_types()
2002
1982
  if SearchKey.IPV6_ADDRESS in search_keys:
2003
- # search_keys.remove(SearchKey.IPV6_ADDRESS)
2004
- search_keys.pop(SearchKey.IPV6_ADDRESS, None)
1983
+ search_keys.remove(SearchKey.IPV6_ADDRESS)
2005
1984
 
2006
- keys = (
2007
- "{"
2008
- + ", ".join(
2009
- [
2010
- f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
2011
- for key, name in search_keys.items()
2012
- ]
2013
- )
2014
- + "}"
2015
- )
1985
+ keys = "{" + ", ".join([f'"{key.name}": "{key_example(key)}"' for key in search_keys]) + "}"
2016
1986
  features_for_transform = self._search_task.get_features_for_transform()
2017
1987
  if features_for_transform:
2018
1988
  original_features_for_transform = [
@@ -2093,9 +2063,7 @@ class FeaturesEnricher(TransformerMixin):
2093
2063
 
2094
2064
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2095
2065
 
2096
- columns_to_drop = [
2097
- c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
2098
- ]
2066
+ columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
2099
2067
  if len(columns_to_drop) > 0:
2100
2068
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2101
2069
  self.logger.warning(msg)
@@ -2124,7 +2092,7 @@ class FeaturesEnricher(TransformerMixin):
2124
2092
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2125
2093
  if date_column is not None:
2126
2094
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2127
- df = converter.convert(df, keep_time=True)
2095
+ df = converter.convert(df)
2128
2096
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2129
2097
  generated_features.extend(converter.generated_features)
2130
2098
  else:
@@ -2219,12 +2187,11 @@ class FeaturesEnricher(TransformerMixin):
2219
2187
 
2220
2188
  if add_fit_system_record_id:
2221
2189
  df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2190
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2191
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2222
2192
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2223
2193
  features_not_to_pass.append(SORT_ID)
2224
2194
 
2225
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2226
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2227
-
2228
2195
  # search keys might be changed after explode
2229
2196
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2230
2197
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
@@ -2243,7 +2210,7 @@ class FeaturesEnricher(TransformerMixin):
2243
2210
 
2244
2211
  combined_search_keys = combine_search_keys(search_keys.keys())
2245
2212
 
2246
- df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2213
+ df_without_features = df.drop(columns=features_not_to_pass)
2247
2214
 
2248
2215
  df_without_features, full_duplicates_warning = clean_full_duplicates(
2249
2216
  df_without_features, self.logger, bundle=self.bundle
@@ -2352,15 +2319,11 @@ class FeaturesEnricher(TransformerMixin):
2352
2319
  else:
2353
2320
  result = enrich()
2354
2321
 
2355
- selecting_columns = [
2356
- c
2357
- for c in itertools.chain(validated_X.columns.tolist(), generated_features)
2358
- if c not in self.dropped_client_feature_names_
2359
- ]
2360
2322
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2361
- selecting_columns.extend(
2323
+ existing_filtered_columns = [
2362
2324
  c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2363
- )
2325
+ ]
2326
+ selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2364
2327
  if add_fit_system_record_id:
2365
2328
  selecting_columns.append(SORT_ID)
2366
2329
 
@@ -3527,7 +3490,15 @@ class FeaturesEnricher(TransformerMixin):
3527
3490
 
3528
3491
  return result_train, result_eval_sets
3529
3492
 
3493
+ @staticmethod
3494
+ def _round_shap_value(shap: float) -> float:
3495
+ if shap > 0.0 and shap < 0.0001:
3496
+ return 0.0001
3497
+ else:
3498
+ return round(shap, 4)
3499
+
3530
3500
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3501
+ llm_source = "LLM with external data augmentation"
3531
3502
  if self._search_task is None:
3532
3503
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3533
3504
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3538,40 +3509,116 @@ class FeaturesEnricher(TransformerMixin):
3538
3509
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3539
3510
 
3540
3511
  self.feature_names_ = []
3541
- self.dropped_client_feature_names_ = []
3542
3512
  self.feature_importances_ = []
3543
3513
  features_info = []
3544
3514
  features_info_without_links = []
3545
3515
  internal_features_info = []
3546
3516
 
3517
+ def list_or_single(lst: List[str], single: str):
3518
+ return lst or ([single] if single else [])
3519
+
3520
+ def to_anchor(link: str, value: str) -> str:
3521
+ if not value:
3522
+ return ""
3523
+ elif not link:
3524
+ return value
3525
+ elif value == llm_source:
3526
+ return value
3527
+ else:
3528
+ return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
3529
+
3530
+ def make_links(names: List[str], links: List[str]):
3531
+ all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
3532
+ return ",".join(all_links)
3533
+
3547
3534
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3548
3535
  for feature_meta in features_meta:
3549
3536
  if feature_meta.name in original_names_dict.keys():
3550
3537
  feature_meta.name = original_names_dict[feature_meta.name]
3551
-
3552
- is_client_feature = feature_meta.name in x_columns
3553
-
3554
- if feature_meta.shap_value == 0.0:
3555
- if self.select_features:
3556
- self.dropped_client_feature_names_.append(feature_meta.name)
3557
- continue
3558
-
3559
- # Use only important features
3538
+ # Use only enriched features
3560
3539
  if (
3561
- feature_meta.name in self.fit_generated_features
3540
+ feature_meta.name in x_columns
3562
3541
  or feature_meta.name == COUNTRY
3563
- # In select_features mode we select also from etalon features and need to show them
3564
- or (not self.select_features and is_client_feature)
3542
+ or feature_meta.shap_value == 0.0
3543
+ or feature_meta.name in self.fit_generated_features
3565
3544
  ):
3566
3545
  continue
3567
3546
 
3547
+ feature_sample = []
3568
3548
  self.feature_names_.append(feature_meta.name)
3569
- self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3549
+ self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3550
+ if feature_meta.name in features_df.columns:
3551
+ feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3552
+ if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
3553
+ feature_sample = [round(f, 4) for f in feature_sample]
3554
+ feature_sample = [str(f) for f in feature_sample]
3555
+ feature_sample = ", ".join(feature_sample)
3556
+ if len(feature_sample) > 30:
3557
+ feature_sample = feature_sample[:30] + "..."
3558
+
3559
+ internal_provider = feature_meta.data_provider or "Upgini"
3560
+ providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
3561
+ provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
3562
+ if providers:
3563
+ provider = make_links(providers, provider_links)
3564
+ else:
3565
+ provider = to_anchor("https://upgini.com", "Upgini")
3570
3566
 
3571
- feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3572
- features_info.append(feature_info.to_row(self.bundle))
3573
- features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3574
- internal_features_info.append(feature_info.to_internal_row(self.bundle))
3567
+ internal_source = feature_meta.data_source or (
3568
+ llm_source
3569
+ if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
3570
+ else ""
3571
+ )
3572
+ sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
3573
+ source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
3574
+ if sources:
3575
+ source = make_links(sources, source_links)
3576
+ else:
3577
+ source = internal_source
3578
+
3579
+ internal_feature_name = feature_meta.name
3580
+ if feature_meta.doc_link:
3581
+ feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
3582
+ else:
3583
+ feature_name = internal_feature_name
3584
+
3585
+ features_info.append(
3586
+ {
3587
+ self.bundle.get("features_info_name"): feature_name,
3588
+ self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3589
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3590
+ self.bundle.get("features_info_value_preview"): feature_sample,
3591
+ self.bundle.get("features_info_provider"): provider,
3592
+ self.bundle.get("features_info_source"): source,
3593
+ self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3594
+ }
3595
+ )
3596
+ features_info_without_links.append(
3597
+ {
3598
+ self.bundle.get("features_info_name"): internal_feature_name,
3599
+ self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3600
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3601
+ self.bundle.get("features_info_value_preview"): feature_sample,
3602
+ self.bundle.get("features_info_provider"): internal_provider,
3603
+ self.bundle.get("features_info_source"): internal_source,
3604
+ self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3605
+ }
3606
+ )
3607
+ internal_features_info.append(
3608
+ {
3609
+ self.bundle.get("features_info_name"): internal_feature_name,
3610
+ "feature_link": feature_meta.doc_link,
3611
+ self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3612
+ self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3613
+ self.bundle.get("features_info_value_preview"): feature_sample,
3614
+ self.bundle.get("features_info_provider"): internal_provider,
3615
+ "provider_link": feature_meta.data_provider_link,
3616
+ self.bundle.get("features_info_source"): internal_source,
3617
+ "source_link": feature_meta.data_source_link,
3618
+ self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3619
+ self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3620
+ }
3621
+ )
3575
3622
 
3576
3623
  if len(features_info) > 0:
3577
3624
  self.features_info = pd.DataFrame(features_info)
@@ -3596,22 +3643,7 @@ class FeaturesEnricher(TransformerMixin):
3596
3643
  autofe_meta = self._search_task.get_autofe_metadata()
3597
3644
  if autofe_meta is None:
3598
3645
  return None
3599
- if len(self._internal_features_info) != 0:
3600
-
3601
- def to_feature_meta(row):
3602
- fm = FeaturesMetadataV2(
3603
- name=row[bundle.get("features_info_name")],
3604
- type="",
3605
- source="",
3606
- hit_rate=bundle.get("features_info_hitrate"),
3607
- shap_value=bundle.get("features_info_shap"),
3608
- data_source=bundle.get("features_info_source"),
3609
- )
3610
- return fm
3611
-
3612
- features_meta = self._internal_features_info.apply(to_feature_meta).to_list()
3613
- else:
3614
- features_meta = self._search_task.get_all_features_metadata_v2()
3646
+ features_meta = self._search_task.get_all_features_metadata_v2()
3615
3647
 
3616
3648
  def get_feature_by_name(name: str):
3617
3649
  for m in features_meta:
@@ -882,7 +882,7 @@ class _RestClient:
882
882
  if content_type:
883
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
884
884
  if trace_id:
885
- headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
886
886
  for header_key, header_value in additional_headers.items():
887
887
  headers[header_key] = header_value
888
888
  return headers
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Union
4
+ from typing import Any, Dict, List, Optional, Set, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
228
228
  return c
229
229
  return None
230
230
 
231
- def search_types(self) -> Dict[SearchKey, str]:
232
- search_keys = dict()
231
+ def search_types(self) -> Set[SearchKey]:
232
+ search_keys = set()
233
233
  for keys_group in self.searchKeys:
234
234
  for key in keys_group:
235
235
  column = self.column_by_name(key)
236
236
  if column:
237
- search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
237
+ search_keys.add(SearchKey.from_meaning_type(column.meaningType))
238
238
  return search_keys
239
239
 
240
240
 
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
4
3
  import inspect
5
4
  import logging
6
5
  import re
@@ -211,21 +210,6 @@ SUPPORTED_CATBOOST_METRICS = {
211
210
  }
212
211
 
213
212
 
214
- @dataclass
215
- class _CrossValResults:
216
- metric: Optional[float]
217
- metric_std: Optional[float]
218
- shap_values: Optional[Dict[str, float]]
219
-
220
- def get_display_metric(self) -> Optional[str]:
221
- if self.metric is None:
222
- return None
223
- elif self.metric_std is None:
224
- return f"{self.metric:.3f}"
225
- else:
226
- return f"{self.metric:.3f} ± {self.metric_std:.3f}"
227
-
228
-
229
213
  class EstimatorWrapper:
230
214
  def __init__(
231
215
  self,
@@ -313,11 +297,11 @@ class EstimatorWrapper:
313
297
 
314
298
  def cross_val_predict(
315
299
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
316
- ) -> _CrossValResults:
300
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
317
301
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
318
302
 
319
303
  if x.shape[1] == 0:
320
- return _CrossValResults(metric=None, metric_std=None, shap_values=None)
304
+ return None
321
305
 
322
306
  scorer = check_scoring(self.estimator, scoring=self.scorer)
323
307
 
@@ -342,7 +326,7 @@ class EstimatorWrapper:
342
326
 
343
327
  self.check_fold_metrics(metrics_by_fold)
344
328
 
345
- metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
329
+ metric = np.mean(metrics_by_fold) * self.multiplier
346
330
 
347
331
  splits = self.cv.split(x, y, groups)
348
332
 
@@ -367,7 +351,7 @@ class EstimatorWrapper:
367
351
  else:
368
352
  average_shap_values = None
369
353
 
370
- return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
354
+ return self.post_process_metric(metric), average_shap_values
371
355
 
372
356
  def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
373
357
  return shap_values
@@ -383,25 +367,17 @@ class EstimatorWrapper:
383
367
  metric = 2 * metric - 1
384
368
  return metric
385
369
 
386
- def calculate_metric(
387
- self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
388
- ) -> _CrossValResults:
370
+ def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
389
371
  x, y, _ = self._prepare_to_calculate(x, y)
390
372
  if baseline_score_column is not None and self.metric_name == "GINI":
391
- metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
373
+ metric = roc_auc_score(y, x[baseline_score_column])
392
374
  else:
393
375
  metrics = []
394
376
  for est in self.cv_estimators:
395
377
  metrics.append(self.scorer(est, x, y))
396
378
 
397
- metric, metric_std = self._calculate_metric_from_folds(metrics)
398
- return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
399
-
400
- def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
401
- metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
402
- metric = np.mean(metrics_by_fold) * self.multiplier
403
- metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
404
- return metric, metric_std
379
+ metric = np.mean(metrics) * self.multiplier
380
+ return self.post_process_metric(metric)
405
381
 
406
382
  @staticmethod
407
383
  def create(
@@ -615,7 +591,7 @@ class CatBoostWrapper(EstimatorWrapper):
615
591
 
616
592
  def cross_val_predict(
617
593
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
618
- ) -> _CrossValResults:
594
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
619
595
  try:
620
596
  return super().cross_val_predict(x, y, baseline_score_column)
621
597
  except Exception as e:
@@ -111,23 +111,21 @@ class DateTimeSearchKeyConverter:
111
111
 
112
112
  # Define function to apply sine and cosine transformations
113
113
  def add_cyclical_features(df, column, period):
114
- period_suffix = f"_{period}" if column != "day_in_quarter" else ""
114
+ period_suffix = f"_{period}" if column != 'day_in_quarter' else ""
115
115
  sin_feature = f"datetime_{column}_sin{period_suffix}"
116
116
  cos_feature = f"datetime_{column}_cos{period_suffix}"
117
- if sin_feature not in df.columns:
118
- df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
119
- self.generated_features.append(sin_feature)
120
- if cos_feature not in df.columns:
121
- df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
- self.generated_features.append(cos_feature)
117
+ df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
118
+ df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
119
+ self.generated_features.append(sin_feature)
120
+ self.generated_features.append(cos_feature)
123
121
 
124
122
  df["quarter"] = df[self.date_column].dt.quarter
125
123
 
126
124
  # Calculate the start date of the quarter for each timestamp
127
- df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
125
+ df["quarter_start"] = df["timestamp"].dt.to_period("Q").dt.start_time
128
126
 
129
127
  # Calculate the day in the quarter
130
- df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
128
+ df["day_in_quarter"] = (df["timestamp"] - df["quarter_start"]).dt.days + 1
131
129
 
132
130
  # Vectorized calculation of days_in_quarter
133
131
  quarter = df["quarter"]
@@ -139,7 +137,6 @@ class DateTimeSearchKeyConverter:
139
137
  quarter_end_month = np.where(quarter == 4, 1, month + 3)
140
138
 
141
139
  end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
142
- end.index = df.index
143
140
 
144
141
  df["days_in_quarter"] = (end - start).dt.days
145
142
 
@@ -38,9 +38,8 @@ class EmailDomainGenerator:
38
38
  def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
39
  for email_col in self.email_columns:
40
40
  domain_feature = email_col + self.DOMAIN_SUFFIX
41
- if domain_feature not in df.columns:
42
- df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
- self.generated_features.append(domain_feature)
41
+ df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
+ self.generated_features.append(domain_feature)
44
43
  return df
45
44
 
46
45
  @staticmethod
@@ -2,7 +2,6 @@ import logging
2
2
  from logging import Logger
3
3
  from typing import Dict, List, Optional, Tuple
4
4
 
5
- import numpy as np
6
5
  import pandas as pd
7
6
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
8
7
 
@@ -84,21 +83,10 @@ class FeaturesValidator:
84
83
  return [
85
84
  i
86
85
  for i in df
87
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
86
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
88
87
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
89
88
  ]
90
89
 
91
- @staticmethod
92
- def __is_integer(series: pd.Series) -> bool:
93
- return (
94
- is_integer_dtype(series)
95
- or series.dropna()
96
- .apply(
97
- lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
98
- )
99
- .all()
100
- )
101
-
102
90
  @staticmethod
103
91
  def find_constant_features(df: pd.DataFrame) -> List[str]:
104
92
  return [i for i in df if df[i].nunique() <= 1]
@@ -1 +0,0 @@
1
- __version__ = "1.2.29"
@@ -1,172 +0,0 @@
1
- from dataclasses import dataclass
2
- import itertools
3
- from typing import Dict, List
4
-
5
- import numpy as np
6
- import pandas as pd
7
-
8
- from upgini.metadata import FeaturesMetadataV2
9
- from upgini.resource_bundle import ResourceBundle
10
-
11
-
12
- LLM_SOURCE = "LLM with external data augmentation"
13
-
14
-
15
- @dataclass
16
- class FeatureInfo:
17
- name: str
18
- internal_name: str
19
- rounded_shap: float
20
- hitrate: float
21
- value_preview: str
22
- provider: str
23
- internal_provider: str
24
- source: str
25
- internal_source: str
26
- update_frequency: str
27
- commercial_schema: str
28
- doc_link: str
29
- data_provider_link: str
30
- data_source_link: str
31
-
32
- @staticmethod
33
- def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
34
- return FeatureInfo(
35
- name=_get_name(feature_meta),
36
- internal_name=_get_internal_name(feature_meta),
37
- rounded_shap=_round_shap_value(feature_meta.shap_value),
38
- hitrate=feature_meta.hit_rate,
39
- value_preview=_get_feature_sample(feature_meta, data),
40
- provider=_get_provider(feature_meta, is_client_feature),
41
- internal_provider=_get_internal_provider(feature_meta, is_client_feature),
42
- source=_get_source(feature_meta, is_client_feature),
43
- internal_source=_get_internal_source(feature_meta, is_client_feature),
44
- update_frequency=feature_meta.update_frequency,
45
- commercial_schema=feature_meta.commercial_schema,
46
- doc_link=feature_meta.doc_link,
47
- data_provider_link=feature_meta.data_provider_link,
48
- data_source_link=feature_meta.data_source_link,
49
- )
50
-
51
- def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
52
- return {
53
- bundle.get("features_info_name"): self.name,
54
- bundle.get("features_info_shap"): self.rounded_shap,
55
- bundle.get("features_info_hitrate"): self.hitrate,
56
- bundle.get("features_info_value_preview"): self.value_preview,
57
- bundle.get("features_info_provider"): self.provider,
58
- bundle.get("features_info_source"): self.source,
59
- bundle.get("features_info_update_frequency"): self.update_frequency,
60
- }
61
-
62
- def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
63
- return {
64
- bundle.get("features_info_name"): self.internal_name,
65
- bundle.get("features_info_shap"): self.rounded_shap,
66
- bundle.get("features_info_hitrate"): self.hitrate,
67
- bundle.get("features_info_value_preview"): self.value_preview,
68
- bundle.get("features_info_provider"): self.internal_provider,
69
- bundle.get("features_info_source"): self.internal_source,
70
- bundle.get("features_info_update_frequency"): self.update_frequency,
71
- }
72
-
73
- def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
74
- return {
75
- bundle.get("features_info_name"): self.internal_name,
76
- "feature_link": self.doc_link,
77
- bundle.get("features_info_shap"): self.rounded_shap,
78
- bundle.get("features_info_hitrate"): self.hitrate,
79
- bundle.get("features_info_value_preview"): self.value_preview,
80
- bundle.get("features_info_provider"): self.internal_provider,
81
- "provider_link": self.data_provider_link,
82
- bundle.get("features_info_source"): self.internal_source,
83
- "source_link": self.data_source_link,
84
- bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
85
- bundle.get("features_info_update_frequency"): self.update_frequency,
86
- }
87
-
88
-
89
- def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
90
- if feature_meta.name in data.columns:
91
- feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
92
- if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
93
- feature_sample = [round(f, 4) for f in feature_sample]
94
- feature_sample = [str(f) for f in feature_sample]
95
- feature_sample = ", ".join(feature_sample)
96
- if len(feature_sample) > 30:
97
- feature_sample = feature_sample[:30] + "..."
98
- else:
99
- feature_sample = ""
100
- return feature_sample
101
-
102
-
103
- def _get_name(feature_meta: FeaturesMetadataV2) -> str:
104
- if feature_meta.doc_link:
105
- return _to_anchor(feature_meta.doc_link, feature_meta.name)
106
- else:
107
- return feature_meta.name
108
-
109
-
110
- def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
111
- return feature_meta.name
112
-
113
-
114
- def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
115
- providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
116
- provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
117
- if providers:
118
- provider = _make_links(providers, provider_links)
119
- else:
120
- provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
121
- return provider
122
-
123
-
124
- def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
125
- return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
126
-
127
-
128
- def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
129
- sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
130
- source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
131
- if sources:
132
- source = _make_links(sources, source_links)
133
- else:
134
- source = _get_internal_source(feature_meta, is_client_feature)
135
- return source
136
-
137
-
138
- def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
139
- return feature_meta.data_source or (
140
- LLM_SOURCE
141
- if not feature_meta.name.endswith("_country")
142
- and not feature_meta.name.endswith("_postal_code")
143
- and not is_client_feature
144
- else ""
145
- )
146
-
147
-
148
- def _list_or_single(lst: List[str], single: str):
149
- return lst or ([single] if single else [])
150
-
151
-
152
- def _to_anchor(link: str, value: str) -> str:
153
- if not value:
154
- return ""
155
- elif not link:
156
- return value
157
- elif value == LLM_SOURCE:
158
- return value
159
- else:
160
- return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
-
162
-
163
- def _make_links(names: List[str], links: List[str]):
164
- all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
165
- return ",".join(all_links)
166
-
167
-
168
- def _round_shap_value(shap: float) -> float:
169
- if shap > 0.0 and shap < 0.0001:
170
- return 0.0001
171
- else:
172
- return round(shap, 4)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes