upgini 1.2.29a6__tar.gz → 1.2.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. See the package registry's advisory page for more details.

Files changed (67)
  1. {upgini-1.2.29a6 → upgini-1.2.30}/PKG-INFO +1 -1
  2. upgini-1.2.30/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/features_enricher.py +97 -58
  4. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/http.py +1 -1
  5. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/metadata.py +4 -4
  6. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/resource_bundle/strings.properties +8 -0
  7. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/datetime_utils.py +6 -4
  8. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/email_utils.py +3 -2
  9. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/features_validator.py +13 -1
  10. upgini-1.2.29a6/src/upgini/__about__.py +0 -1
  11. {upgini-1.2.29a6 → upgini-1.2.30}/.gitignore +0 -0
  12. {upgini-1.2.29a6 → upgini-1.2.30}/LICENSE +0 -0
  13. {upgini-1.2.29a6 → upgini-1.2.30}/README.md +0 -0
  14. {upgini-1.2.29a6 → upgini-1.2.30}/pyproject.toml +0 -0
  15. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/__init__.py +0 -0
  16. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/ads.py +0 -0
  17. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/all_operands.py +0 -0
  21. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/date.py +0 -0
  23. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/feature.py +0 -0
  24. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/operand.py +0 -0
  26. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/unary.py +0 -0
  27. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/autofe/vector.py +0 -0
  28. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/data_source/__init__.py +0 -0
  29. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/data_source/data_source_publisher.py +0 -0
  30. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/dataset.py +0 -0
  31. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/errors.py +0 -0
  32. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/lazy_import.py +0 -0
  33. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/mdc/__init__.py +0 -0
  34. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/mdc/context.py +0 -0
  35. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/metrics.py +0 -0
  36. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/normalizer/__init__.py +0 -0
  37. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/normalizer/normalize_utils.py +0 -0
  38. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/resource_bundle/__init__.py +0 -0
  39. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/resource_bundle/exceptions.py +0 -0
  40. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  48. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.29a6 → upgini-1.2.30}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a6
3
+ Version: 1.2.30
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.30"
@@ -350,6 +350,7 @@ class FeaturesEnricher(TransformerMixin):
350
350
  self.add_date_if_missing = add_date_if_missing
351
351
  self.features_info_display_handle = None
352
352
  self.data_sources_display_handle = None
353
+ self.autofe_features_display_handle = None
353
354
  self.report_button_handle = None
354
355
 
355
356
  def _get_api_key(self):
@@ -1049,7 +1050,7 @@ class FeaturesEnricher(TransformerMixin):
1049
1050
  enriched_shaps = enriched_cv_result.shap_values
1050
1051
 
1051
1052
  if enriched_shaps is not None:
1052
- self._update_shap_values(enriched_shaps)
1053
+ self._update_shap_values(trace_id, validated_X.columns.to_list(), enriched_shaps)
1053
1054
 
1054
1055
  if enriched_metric is None:
1055
1056
  self.logger.warning(
@@ -1209,37 +1210,11 @@ class FeaturesEnricher(TransformerMixin):
1209
1210
  finally:
1210
1211
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1211
1212
 
1212
- def _update_shap_values(self, new_shaps: Dict[str, float]):
1213
+ def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
1213
1214
  new_shaps = {
1214
1215
  feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1215
1216
  }
1216
- features_importances = list(new_shaps.items())
1217
- features_importances.sort(key=lambda m: (-m[1], m[0]))
1218
- self.feature_names_, self.feature_importances_ = zip(*features_importances)
1219
- self.feature_names_ = list(self.feature_names_)
1220
- self.feature_importances_ = list(self.feature_importances_)
1221
-
1222
- feature_name_header = self.bundle.get("features_info_name")
1223
- shap_value_header = self.bundle.get("features_info_shap")
1224
-
1225
- def update_shap(row):
1226
- return new_shaps.get(row[feature_name_header], row[shap_value_header])
1227
-
1228
- self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
1229
- self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
1230
- self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
1231
- update_shap, axis=1
1232
- )
1233
- self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
1234
-
1235
- self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1236
- self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1237
- self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
1238
-
1239
- self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
1240
- self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
1241
- self._features_info_without_links, self.bundle
1242
- )
1217
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1243
1218
 
1244
1219
  if self.features_info_display_handle is not None:
1245
1220
  try:
@@ -1252,7 +1227,7 @@ class FeaturesEnricher(TransformerMixin):
1252
1227
  display_handle=self.features_info_display_handle,
1253
1228
  )
1254
1229
  except (ImportError, NameError):
1255
- print(self._internal_features_info)
1230
+ pass
1256
1231
  if self.data_sources_display_handle is not None:
1257
1232
  try:
1258
1233
  _ = get_ipython() # type: ignore
@@ -1260,11 +1235,24 @@ class FeaturesEnricher(TransformerMixin):
1260
1235
  display_html_dataframe(
1261
1236
  self.relevant_data_sources,
1262
1237
  self._relevant_data_sources_wo_links,
1263
- self.bundle.get("relevant_features_header"),
1238
+ self.bundle.get("relevant_data_sources_header"),
1264
1239
  display_handle=self.data_sources_display_handle,
1265
1240
  )
1266
1241
  except (ImportError, NameError):
1267
- print(self._relevant_data_sources_wo_links)
1242
+ pass
1243
+ if self.autofe_features_display_handle is not None:
1244
+ try:
1245
+ _ = get_ipython() # type: ignore
1246
+ autofe_descriptions_df = self.get_autofe_features_description()
1247
+ if autofe_descriptions_df is not None:
1248
+ display_html_dataframe(
1249
+ df=autofe_descriptions_df,
1250
+ internal_df=autofe_descriptions_df,
1251
+ header=self.bundle.get("autofe_descriptions_header"),
1252
+ display_handle=self.autofe_features_display_handle,
1253
+ )
1254
+ except (ImportError, NameError):
1255
+ pass
1268
1256
  if self.report_button_handle is not None:
1269
1257
  try:
1270
1258
  _ = get_ipython() # type: ignore
@@ -1448,7 +1436,11 @@ class FeaturesEnricher(TransformerMixin):
1448
1436
  client_features = [
1449
1437
  c
1450
1438
  for c in X_sampled.columns.to_list()
1451
- if (not self.select_features or c in self.feature_names_)
1439
+ if (
1440
+ not self.select_features
1441
+ or c in self.feature_names_
1442
+ or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1443
+ )
1452
1444
  and c
1453
1445
  not in (
1454
1446
  excluding_search_keys
@@ -1665,7 +1657,10 @@ class FeaturesEnricher(TransformerMixin):
1665
1657
  generated_features = []
1666
1658
  if date_column is not None:
1667
1659
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1668
- df = converter.convert(df, keep_time=True)
1660
+ # Leave original date column values
1661
+ df_with_date_features = converter.convert(df, keep_time=True)
1662
+ df_with_date_features[date_column] = df[date_column]
1663
+ df = df_with_date_features
1669
1664
  generated_features = converter.generated_features
1670
1665
 
1671
1666
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -1674,9 +1669,10 @@ class FeaturesEnricher(TransformerMixin):
1674
1669
  df = generator.generate(df)
1675
1670
  generated_features.extend(generator.generated_features)
1676
1671
 
1677
- normalizer = Normalizer(self.bundle, self.logger)
1678
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1679
- columns_renaming = normalizer.columns_renaming
1672
+ # normalizer = Normalizer(self.bundle, self.logger)
1673
+ # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1674
+ # columns_renaming = normalizer.columns_renaming
1675
+ columns_renaming = {c: c for c in df.columns}
1680
1676
 
1681
1677
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1682
1678
 
@@ -1992,9 +1988,19 @@ class FeaturesEnricher(TransformerMixin):
1992
1988
  file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
1993
1989
  search_keys = file_metadata.search_types()
1994
1990
  if SearchKey.IPV6_ADDRESS in search_keys:
1995
- search_keys.remove(SearchKey.IPV6_ADDRESS)
1991
+ # search_keys.remove(SearchKey.IPV6_ADDRESS)
1992
+ search_keys.pop(SearchKey.IPV6_ADDRESS, None)
1996
1993
 
1997
- keys = "{" + ", ".join([f'"{key.name}": "{key_example(key)}"' for key in search_keys]) + "}"
1994
+ keys = (
1995
+ "{"
1996
+ + ", ".join(
1997
+ [
1998
+ f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
1999
+ for key, name in search_keys.items()
2000
+ ]
2001
+ )
2002
+ + "}"
2003
+ )
1998
2004
  features_for_transform = self._search_task.get_features_for_transform()
1999
2005
  if features_for_transform:
2000
2006
  original_features_for_transform = [
@@ -2106,7 +2112,7 @@ class FeaturesEnricher(TransformerMixin):
2106
2112
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2107
2113
  if date_column is not None:
2108
2114
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2109
- df = converter.convert(df)
2115
+ df = converter.convert(df, keep_time=True)
2110
2116
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2111
2117
  generated_features.extend(converter.generated_features)
2112
2118
  else:
@@ -2201,11 +2207,12 @@ class FeaturesEnricher(TransformerMixin):
2201
2207
 
2202
2208
  if add_fit_system_record_id:
2203
2209
  df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2204
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2205
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2206
2210
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2207
2211
  features_not_to_pass.append(SORT_ID)
2208
2212
 
2213
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2214
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2215
+
2209
2216
  # search keys might be changed after explode
2210
2217
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2211
2218
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
@@ -2224,7 +2231,7 @@ class FeaturesEnricher(TransformerMixin):
2224
2231
 
2225
2232
  combined_search_keys = combine_search_keys(search_keys.keys())
2226
2233
 
2227
- df_without_features = df.drop(columns=features_not_to_pass)
2234
+ df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2228
2235
 
2229
2236
  df_without_features, full_duplicates_warning = clean_full_duplicates(
2230
2237
  df_without_features, self.logger, bundle=self.bundle
@@ -2339,7 +2346,9 @@ class FeaturesEnricher(TransformerMixin):
2339
2346
  if c not in self.dropped_client_feature_names_
2340
2347
  ]
2341
2348
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2342
- selecting_columns.extend(c for c in filtered_columns if c in result.columns and c not in validated_X.columns)
2349
+ selecting_columns.extend(
2350
+ c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2351
+ )
2343
2352
  if add_fit_system_record_id:
2344
2353
  selecting_columns.append(SORT_ID)
2345
2354
 
@@ -2794,7 +2803,12 @@ class FeaturesEnricher(TransformerMixin):
2794
2803
  autofe_description = self.get_autofe_features_description()
2795
2804
  if autofe_description is not None:
2796
2805
  self.logger.info(f"AutoFE descriptions: {autofe_description}")
2797
- display_html_dataframe(autofe_description, autofe_description, "*Description of AutoFE feature names")
2806
+ self.autofe_features_display_handle = display_html_dataframe(
2807
+ df=autofe_description,
2808
+ internal_df=autofe_description,
2809
+ header=self.bundle.get("autofe_descriptions_header"),
2810
+ display_id="autofe_descriptions",
2811
+ )
2798
2812
 
2799
2813
  if self._has_paid_features(exclude_features_sources):
2800
2814
  if calculate_metrics is not None and calculate_metrics:
@@ -3506,7 +3520,9 @@ class FeaturesEnricher(TransformerMixin):
3506
3520
 
3507
3521
  return result_train, result_eval_sets
3508
3522
 
3509
- def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3523
+ def __prepare_feature_importances(
3524
+ self, trace_id: str, x_columns: List[str], updated_shaps: Optional[Dict[str, float]] = None, silent=False
3525
+ ):
3510
3526
  if self._search_task is None:
3511
3527
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3512
3528
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3523,6 +3539,10 @@ class FeaturesEnricher(TransformerMixin):
3523
3539
  features_info_without_links = []
3524
3540
  internal_features_info = []
3525
3541
 
3542
+ if updated_shaps is not None:
3543
+ for fm in features_meta:
3544
+ fm.shap_value = updated_shaps.get(fm.name, 0.0)
3545
+
3526
3546
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3527
3547
  for feature_meta in features_meta:
3528
3548
  if feature_meta.name in original_names_dict.keys():
@@ -3544,7 +3564,6 @@ class FeaturesEnricher(TransformerMixin):
3544
3564
  ):
3545
3565
  continue
3546
3566
 
3547
-
3548
3567
  self.feature_names_.append(feature_meta.name)
3549
3568
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3550
3569
 
@@ -3576,7 +3595,22 @@ class FeaturesEnricher(TransformerMixin):
3576
3595
  autofe_meta = self._search_task.get_autofe_metadata()
3577
3596
  if autofe_meta is None:
3578
3597
  return None
3579
- features_meta = self._search_task.get_all_features_metadata_v2()
3598
+ if len(self._internal_features_info) != 0:
3599
+
3600
+ def to_feature_meta(row):
3601
+ fm = FeaturesMetadataV2(
3602
+ name=row[bundle.get("features_info_name")],
3603
+ type="",
3604
+ source="",
3605
+ hit_rate=row[bundle.get("features_info_hitrate")],
3606
+ shap_value=row[bundle.get("features_info_shap")],
3607
+ data_source=row[bundle.get("features_info_source")],
3608
+ )
3609
+ return fm
3610
+
3611
+ features_meta = self._internal_features_info.apply(to_feature_meta, axis=1).to_list()
3612
+ else:
3613
+ features_meta = self._search_task.get_all_features_metadata_v2()
3580
3614
 
3581
3615
  def get_feature_by_name(name: str):
3582
3616
  for m in features_meta:
@@ -3605,27 +3639,32 @@ class FeaturesEnricher(TransformerMixin):
3605
3639
  self.logger.warning(f"Feature meta for display index {m.display_index} not found")
3606
3640
  continue
3607
3641
  description["shap"] = feature_meta.shap_value
3608
- description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "").replace(
3609
- "AutoFE: feature from ", ""
3610
- )
3611
- description["Feature name"] = feature_meta.name
3642
+ description[self.bundle.get("autofe_descriptions_sources")] = feature_meta.data_source.replace(
3643
+ "AutoFE: features from ", ""
3644
+ ).replace("AutoFE: feature from ", "")
3645
+ description[self.bundle.get("autofe_descriptions_feature_name")] = feature_meta.name
3612
3646
 
3613
3647
  feature_idx = 1
3614
3648
  for bc in m.base_columns:
3615
- description[f"Feature {feature_idx}"] = bc.hashed_name
3649
+ description[self.bundle.get("autofe_descriptions_feature").format(feature_idx)] = bc.hashed_name
3616
3650
  feature_idx += 1
3617
3651
 
3618
- description["Function"] = ",".join(sorted(autofe_feature.get_all_operand_names()))
3652
+ description[self.bundle.get("autofe_descriptions_function")] = ",".join(
3653
+ sorted(autofe_feature.get_all_operand_names())
3654
+ )
3619
3655
 
3620
3656
  descriptions.append(description)
3621
3657
 
3622
3658
  if len(descriptions) == 0:
3623
3659
  return None
3624
3660
 
3625
- descriptions_df = pd.DataFrame(descriptions)
3626
- descriptions_df.fillna("", inplace=True)
3627
- descriptions_df.sort_values(by="shap", ascending=False, inplace=True)
3628
- descriptions_df.drop(columns="shap", inplace=True)
3661
+ descriptions_df = (
3662
+ pd.DataFrame(descriptions)
3663
+ .fillna("")
3664
+ .sort_values(by="shap", ascending=False)
3665
+ .drop(columns="shap")
3666
+ .reset_index(drop=True)
3667
+ )
3629
3668
  return descriptions_df
3630
3669
 
3631
3670
  except Exception:
@@ -882,7 +882,7 @@ class _RestClient:
882
882
  if content_type:
883
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
884
884
  if trace_id:
885
- headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
886
886
  for header_key, header_value in additional_headers.items():
887
887
  headers[header_key] = header_value
888
888
  return headers
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Set, Union
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
228
228
  return c
229
229
  return None
230
230
 
231
- def search_types(self) -> Set[SearchKey]:
232
- search_keys = set()
231
+ def search_types(self) -> Dict[SearchKey, str]:
232
+ search_keys = dict()
233
233
  for keys_group in self.searchKeys:
234
234
  for key in keys_group:
235
235
  column = self.column_by_name(key)
236
236
  if column:
237
- search_keys.add(SearchKey.from_meaning_type(column.meaningType))
237
+ search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
238
238
  return search_keys
239
239
 
240
240
 
@@ -251,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
251
251
  relevant_data_sources_all_shap=All features SHAP
252
252
  relevant_data_sources_number=Number of relevant features
253
253
 
254
+ # Autofe descriptions
255
+ autofe_descriptions_header=*Description of AutoFE feature names
256
+ autofe_descriptions_sources=Sources
257
+ autofe_descriptions_feature_name=Feature name
258
+ autofe_descriptions_feature=Feature {}
259
+ autofe_descriptions_function=Function
260
+
261
+
254
262
  # Quality metrics table
255
263
  quality_metrics_header=Accuracy after enrichment
256
264
  quality_metrics_train_segment=Train
@@ -114,10 +114,12 @@ class DateTimeSearchKeyConverter:
114
114
  period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
115
  sin_feature = f"datetime_{column}_sin{period_suffix}"
116
116
  cos_feature = f"datetime_{column}_cos{period_suffix}"
117
- df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
118
- df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
119
- self.generated_features.append(sin_feature)
120
- self.generated_features.append(cos_feature)
117
+ if sin_feature not in df.columns:
118
+ df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
119
+ self.generated_features.append(sin_feature)
120
+ if cos_feature not in df.columns:
121
+ df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
+ self.generated_features.append(cos_feature)
121
123
 
122
124
  df["quarter"] = df[self.date_column].dt.quarter
123
125
 
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
38
38
  def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
39
  for email_col in self.email_columns:
40
40
  domain_feature = email_col + self.DOMAIN_SUFFIX
41
- df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
- self.generated_features.append(domain_feature)
41
+ if domain_feature not in df.columns:
42
+ df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
+ self.generated_features.append(domain_feature)
43
44
  return df
44
45
 
45
46
  @staticmethod
@@ -2,6 +2,7 @@ import logging
2
2
  from logging import Logger
3
3
  from typing import Dict, List, Optional, Tuple
4
4
 
5
+ import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
8
 
@@ -83,10 +84,21 @@ class FeaturesValidator:
83
84
  return [
84
85
  i
85
86
  for i in df
86
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
87
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
87
88
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
88
89
  ]
89
90
 
91
+ @staticmethod
92
+ def __is_integer(series: pd.Series) -> bool:
93
+ return (
94
+ is_integer_dtype(series)
95
+ or series.dropna()
96
+ .apply(
97
+ lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
98
+ )
99
+ .all()
100
+ )
101
+
90
102
  @staticmethod
91
103
  def find_constant_features(df: pd.DataFrame) -> List[str]:
92
104
  return [i for i in df if df[i].nunique() <= 1]
@@ -1 +0,0 @@
1
- __version__ = "1.2.29a6"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes