upgini-1.2.29a7-py3-none-any.whl → upgini-1.2.30-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.29a7"
1
+ __version__ = "1.2.30"
@@ -54,6 +54,7 @@ from upgini.metadata import (
54
54
  SYSTEM_RECORD_ID,
55
55
  TARGET,
56
56
  CVType,
57
+ FeaturesMetadataV2,
57
58
  FileColumnMeaningType,
58
59
  ModelTaskType,
59
60
  RuntimeParameters,
@@ -349,6 +350,7 @@ class FeaturesEnricher(TransformerMixin):
349
350
  self.add_date_if_missing = add_date_if_missing
350
351
  self.features_info_display_handle = None
351
352
  self.data_sources_display_handle = None
353
+ self.autofe_features_display_handle = None
352
354
  self.report_button_handle = None
353
355
 
354
356
  def _get_api_key(self):
@@ -1048,7 +1050,7 @@ class FeaturesEnricher(TransformerMixin):
1048
1050
  enriched_shaps = enriched_cv_result.shap_values
1049
1051
 
1050
1052
  if enriched_shaps is not None:
1051
- self._update_shap_values(enriched_shaps)
1053
+ self._update_shap_values(trace_id, validated_X.columns.to_list(), enriched_shaps)
1052
1054
 
1053
1055
  if enriched_metric is None:
1054
1056
  self.logger.warning(
@@ -1208,37 +1210,11 @@ class FeaturesEnricher(TransformerMixin):
1208
1210
  finally:
1209
1211
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1210
1212
 
1211
- def _update_shap_values(self, new_shaps: Dict[str, float]):
1213
+ def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
1212
1214
  new_shaps = {
1213
1215
  feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1214
1216
  }
1215
- features_importances = list(new_shaps.items())
1216
- features_importances.sort(key=lambda m: (-m[1], m[0]))
1217
- self.feature_names_, self.feature_importances_ = zip(*features_importances)
1218
- self.feature_names_ = list(self.feature_names_)
1219
- self.feature_importances_ = list(self.feature_importances_)
1220
-
1221
- feature_name_header = self.bundle.get("features_info_name")
1222
- shap_value_header = self.bundle.get("features_info_shap")
1223
-
1224
- def update_shap(row):
1225
- return new_shaps.get(row[feature_name_header], row[shap_value_header])
1226
-
1227
- self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
1228
- self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
1229
- self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
1230
- update_shap, axis=1
1231
- )
1232
- self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
1233
-
1234
- self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1235
- self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1236
- self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
1237
-
1238
- self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
1239
- self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
1240
- self._features_info_without_links, self.bundle
1241
- )
1217
+ self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1242
1218
 
1243
1219
  if self.features_info_display_handle is not None:
1244
1220
  try:
@@ -1251,7 +1227,7 @@ class FeaturesEnricher(TransformerMixin):
1251
1227
  display_handle=self.features_info_display_handle,
1252
1228
  )
1253
1229
  except (ImportError, NameError):
1254
- print(self._internal_features_info)
1230
+ pass
1255
1231
  if self.data_sources_display_handle is not None:
1256
1232
  try:
1257
1233
  _ = get_ipython() # type: ignore
@@ -1259,11 +1235,24 @@ class FeaturesEnricher(TransformerMixin):
1259
1235
  display_html_dataframe(
1260
1236
  self.relevant_data_sources,
1261
1237
  self._relevant_data_sources_wo_links,
1262
- self.bundle.get("relevant_features_header"),
1238
+ self.bundle.get("relevant_data_sources_header"),
1263
1239
  display_handle=self.data_sources_display_handle,
1264
1240
  )
1265
1241
  except (ImportError, NameError):
1266
- print(self._relevant_data_sources_wo_links)
1242
+ pass
1243
+ if self.autofe_features_display_handle is not None:
1244
+ try:
1245
+ _ = get_ipython() # type: ignore
1246
+ autofe_descriptions_df = self.get_autofe_features_description()
1247
+ if autofe_descriptions_df is not None:
1248
+ display_html_dataframe(
1249
+ df=autofe_descriptions_df,
1250
+ internal_df=autofe_descriptions_df,
1251
+ header=self.bundle.get("autofe_descriptions_header"),
1252
+ display_handle=self.autofe_features_display_handle,
1253
+ )
1254
+ except (ImportError, NameError):
1255
+ pass
1267
1256
  if self.report_button_handle is not None:
1268
1257
  try:
1269
1258
  _ = get_ipython() # type: ignore
@@ -1999,9 +1988,19 @@ class FeaturesEnricher(TransformerMixin):
1999
1988
  file_metadata = self._search_task.get_file_metadata(str(uuid.uuid4()))
2000
1989
  search_keys = file_metadata.search_types()
2001
1990
  if SearchKey.IPV6_ADDRESS in search_keys:
2002
- search_keys.remove(SearchKey.IPV6_ADDRESS)
1991
+ # search_keys.remove(SearchKey.IPV6_ADDRESS)
1992
+ search_keys.pop(SearchKey.IPV6_ADDRESS, None)
2003
1993
 
2004
- keys = "{" + ", ".join([f'"{key.name}": "{key_example(key)}"' for key in search_keys]) + "}"
1994
+ keys = (
1995
+ "{"
1996
+ + ", ".join(
1997
+ [
1998
+ f'"{key.name}": {{"name": "{name}", "value": "{key_example(key)}"}}'
1999
+ for key, name in search_keys.items()
2000
+ ]
2001
+ )
2002
+ + "}"
2003
+ )
2005
2004
  features_for_transform = self._search_task.get_features_for_transform()
2006
2005
  if features_for_transform:
2007
2006
  original_features_for_transform = [
@@ -2804,7 +2803,12 @@ class FeaturesEnricher(TransformerMixin):
2804
2803
  autofe_description = self.get_autofe_features_description()
2805
2804
  if autofe_description is not None:
2806
2805
  self.logger.info(f"AutoFE descriptions: {autofe_description}")
2807
- display_html_dataframe(autofe_description, autofe_description, "*Description of AutoFE feature names")
2806
+ self.autofe_features_display_handle = display_html_dataframe(
2807
+ df=autofe_description,
2808
+ internal_df=autofe_description,
2809
+ header=self.bundle.get("autofe_descriptions_header"),
2810
+ display_id="autofe_descriptions",
2811
+ )
2808
2812
 
2809
2813
  if self._has_paid_features(exclude_features_sources):
2810
2814
  if calculate_metrics is not None and calculate_metrics:
@@ -3516,7 +3520,9 @@ class FeaturesEnricher(TransformerMixin):
3516
3520
 
3517
3521
  return result_train, result_eval_sets
3518
3522
 
3519
- def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3523
+ def __prepare_feature_importances(
3524
+ self, trace_id: str, x_columns: List[str], updated_shaps: Optional[Dict[str, float]] = None, silent=False
3525
+ ):
3520
3526
  if self._search_task is None:
3521
3527
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3522
3528
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3533,6 +3539,10 @@ class FeaturesEnricher(TransformerMixin):
3533
3539
  features_info_without_links = []
3534
3540
  internal_features_info = []
3535
3541
 
3542
+ if updated_shaps is not None:
3543
+ for fm in features_meta:
3544
+ fm.shap_value = updated_shaps.get(fm.name, 0.0)
3545
+
3536
3546
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3537
3547
  for feature_meta in features_meta:
3538
3548
  if feature_meta.name in original_names_dict.keys():
@@ -3585,7 +3595,22 @@ class FeaturesEnricher(TransformerMixin):
3585
3595
  autofe_meta = self._search_task.get_autofe_metadata()
3586
3596
  if autofe_meta is None:
3587
3597
  return None
3588
- features_meta = self._search_task.get_all_features_metadata_v2()
3598
+ if len(self._internal_features_info) != 0:
3599
+
3600
+ def to_feature_meta(row):
3601
+ fm = FeaturesMetadataV2(
3602
+ name=row[bundle.get("features_info_name")],
3603
+ type="",
3604
+ source="",
3605
+ hit_rate=row[bundle.get("features_info_hitrate")],
3606
+ shap_value=row[bundle.get("features_info_shap")],
3607
+ data_source=row[bundle.get("features_info_source")],
3608
+ )
3609
+ return fm
3610
+
3611
+ features_meta = self._internal_features_info.apply(to_feature_meta, axis=1).to_list()
3612
+ else:
3613
+ features_meta = self._search_task.get_all_features_metadata_v2()
3589
3614
 
3590
3615
  def get_feature_by_name(name: str):
3591
3616
  for m in features_meta:
@@ -3614,27 +3639,32 @@ class FeaturesEnricher(TransformerMixin):
3614
3639
  self.logger.warning(f"Feature meta for display index {m.display_index} not found")
3615
3640
  continue
3616
3641
  description["shap"] = feature_meta.shap_value
3617
- description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "").replace(
3618
- "AutoFE: feature from ", ""
3619
- )
3620
- description["Feature name"] = feature_meta.name
3642
+ description[self.bundle.get("autofe_descriptions_sources")] = feature_meta.data_source.replace(
3643
+ "AutoFE: features from ", ""
3644
+ ).replace("AutoFE: feature from ", "")
3645
+ description[self.bundle.get("autofe_descriptions_feature_name")] = feature_meta.name
3621
3646
 
3622
3647
  feature_idx = 1
3623
3648
  for bc in m.base_columns:
3624
- description[f"Feature {feature_idx}"] = bc.hashed_name
3649
+ description[self.bundle.get("autofe_descriptions_feature").format(feature_idx)] = bc.hashed_name
3625
3650
  feature_idx += 1
3626
3651
 
3627
- description["Function"] = ",".join(sorted(autofe_feature.get_all_operand_names()))
3652
+ description[self.bundle.get("autofe_descriptions_function")] = ",".join(
3653
+ sorted(autofe_feature.get_all_operand_names())
3654
+ )
3628
3655
 
3629
3656
  descriptions.append(description)
3630
3657
 
3631
3658
  if len(descriptions) == 0:
3632
3659
  return None
3633
3660
 
3634
- descriptions_df = pd.DataFrame(descriptions)
3635
- descriptions_df.fillna("", inplace=True)
3636
- descriptions_df.sort_values(by="shap", ascending=False, inplace=True)
3637
- descriptions_df.drop(columns="shap", inplace=True)
3661
+ descriptions_df = (
3662
+ pd.DataFrame(descriptions)
3663
+ .fillna("")
3664
+ .sort_values(by="shap", ascending=False)
3665
+ .drop(columns="shap")
3666
+ .reset_index(drop=True)
3667
+ )
3638
3668
  return descriptions_df
3639
3669
 
3640
3670
  except Exception:
upgini/metadata.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Set, Union
4
+ from typing import Any, Dict, List, Optional, Union
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -228,13 +228,13 @@ class FileMetadata(BaseModel):
228
228
  return c
229
229
  return None
230
230
 
231
- def search_types(self) -> Set[SearchKey]:
232
- search_keys = set()
231
+ def search_types(self) -> Dict[SearchKey, str]:
232
+ search_keys = dict()
233
233
  for keys_group in self.searchKeys:
234
234
  for key in keys_group:
235
235
  column = self.column_by_name(key)
236
236
  if column:
237
- search_keys.add(SearchKey.from_meaning_type(column.meaningType))
237
+ search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
238
238
  return search_keys
239
239
 
240
240
 
upgini/resource_bundle/strings.properties CHANGED
@@ -251,6 +251,14 @@ relevant_data_sources_header=Relevant data sources
251
251
  relevant_data_sources_all_shap=All features SHAP
252
252
  relevant_data_sources_number=Number of relevant features
253
253
 
254
+ # Autofe descriptions
255
+ autofe_descriptions_header=*Description of AutoFE feature names
256
+ autofe_descriptions_sources=Sources
257
+ autofe_descriptions_feature_name=Feature name
258
+ autofe_descriptions_feature=Feature {}
259
+ autofe_descriptions_function=Function
260
+
261
+
254
262
  # Quality metrics table
255
263
  quality_metrics_header=Accuracy after enrichment
256
264
  quality_metrics_train_segment=Train
upgini-1.2.30.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a7
3
+ Version: 1.2.30
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.30.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=zQC-_yiNcwPq8o3NPpgr0tGKyMXiUXgF1aIDtN0fDEk,25
1
+ upgini/__about__.py,sha256=WGj1CvVJizDkAvd9BtLpwcsI-hzacJoXGbC8sVpoHYk,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=tBdArcifhTLuDIL4D_eRB1gIBt9ayTVU7Ox2fVKE68c,191300
6
+ upgini/features_enricher.py,sha256=lNfu5Z40NmkkGJScKAwe_0VBtL8liePifuAlKE_flfA,192053
7
7
  upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
- upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
9
+ upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
10
10
  upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=fOAeLTsnx8xvJK-7RPFXprATG0n56jeCdse8sQTuVX8,26674
33
+ upgini/resource_bundle/strings.properties,sha256=bKw_rjZZTomLJhQBqiM7_P2EoRq45_Ng2gP4WE6MRBE,26921
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.29a7.dist-info/METADATA,sha256=sE4t490pcKTOegDZx5S7gX4eh9j_pk8zHl5xKe8Qy08,48580
63
- upgini-1.2.29a7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.29a7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.29a7.dist-info/RECORD,,
62
+ upgini-1.2.30.dist-info/METADATA,sha256=93iMDL28nXF2DJaDP-oUS2CeCFHgnjk2zmAhge2LAHg,48578
63
+ upgini-1.2.30.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.30.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.30.dist-info/RECORD,,