upgini 1.2.71a3832.dev9__tar.gz → 1.2.71a3832.dev10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/PKG-INFO +1 -1
  2. upgini-1.2.71a3832.dev10/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/features_enricher.py +50 -3
  4. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/metrics.py +31 -20
  5. upgini-1.2.71a3832.dev9/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/.gitignore +0 -0
  7. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/LICENSE +0 -0
  8. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/README.md +0 -0
  9. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/pyproject.toml +0 -0
  10. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/http.py +0 -0
  37. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/__init__.py +0 -0
  41. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/normalize_utils.py +0 -0
  42. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/__init__.py +0 -0
  43. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/exceptions.py +0 -0
  44. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings.properties +0 -0
  45. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  46. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/__init__.py +0 -0
  47. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/base.py +0 -0
  48. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/random_under_sampler.py +0 -0
  49. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/utils.py +0 -0
  50. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/search_task.py +0 -0
  51. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/spinner.py +0 -0
  52. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  53. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/__init__.py +0 -0
  54. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/base_search_key_detector.py +0 -0
  55. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/blocked_time_series.py +0 -0
  56. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/country_utils.py +0 -0
  57. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/custom_loss_utils.py +0 -0
  58. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/cv_utils.py +0 -0
  59. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/datetime_utils.py +0 -0
  60. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/deduplicate_utils.py +0 -0
  61. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/display_utils.py +0 -0
  62. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/email_utils.py +0 -0
  63. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/fallback_progress_bar.py +0 -0
  64. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/feature_info.py +0 -0
  65. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/features_validator.py +0 -0
  66. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/format.py +0 -0
  67. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/ip_utils.py +0 -0
  68. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/mstats.py +0 -0
  69. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/phone_utils.py +0 -0
  70. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/postal_code_utils.py +0 -0
  71. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/progress_bar.py +0 -0
  72. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/sklearn_ext.py +0 -0
  73. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/sort.py +0 -0
  74. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/target_utils.py +0 -0
  75. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/track_info.py +0 -0
  76. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/ts_utils.py +0 -0
  77. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/warning_counter.py +0 -0
  78. {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3832.dev9
3
+ Version: 1.2.71a3832.dev10
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.71a3832.dev10"
@@ -1514,6 +1514,8 @@ class FeaturesEnricher(TransformerMixin):
1514
1514
  filtered_enriched_features = self.__filtered_enriched_features(
1515
1515
  importance_threshold,
1516
1516
  max_features,
1517
+ trace_id,
1518
+ validated_X
1517
1519
  )
1518
1520
  filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1519
1521
 
@@ -3805,6 +3807,46 @@ if response.status_code == 200:
3805
3807
 
3806
3808
  return result_features
3807
3809
 
3810
+ def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
3811
+ if self._search_task is None:
3812
+ raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3813
+ features_meta = self._search_task.get_all_features_metadata_v2()
3814
+ if features_meta is None:
3815
+ raise Exception(self.bundle.get("missing_features_meta"))
3816
+
3817
+ original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
3818
+ df = df.rename(columns=original_names_dict)
3819
+
3820
+ features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3821
+
3822
+ importances = {}
3823
+
3824
+ for feature_meta in features_meta:
3825
+ if feature_meta.name in original_names_dict.keys():
3826
+ feature_meta.name = original_names_dict[feature_meta.name]
3827
+
3828
+ is_client_feature = feature_meta.name in df.columns
3829
+
3830
+ if feature_meta.shap_value == 0.0:
3831
+ continue
3832
+
3833
+ # Use only important features
3834
+ if (
3835
+ feature_meta.name == COUNTRY
3836
+ # In select_features mode we select also from etalon features and need to show them
3837
+ or (not self.fit_select_features and is_client_feature)
3838
+ ):
3839
+ continue
3840
+
3841
+ # Temporary workaround for duplicate features metadata
3842
+ if feature_meta.name in self.feature_names_:
3843
+ self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
3844
+ continue
3845
+
3846
+ importances[feature_meta.name] = feature_meta.shap_value
3847
+
3848
+ return importances
3849
+
3808
3850
  def __prepare_feature_importances(
3809
3851
  self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
3810
3852
  ):
@@ -3990,9 +4032,12 @@ if response.status_code == 200:
3990
4032
  )
3991
4033
 
3992
4034
  def __filtered_importance_names(
3993
- self, importance_threshold: Optional[float], max_features: Optional[int]
4035
+ self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
3994
4036
  ) -> List[str]:
3995
- if len(self.feature_names_) == 0:
4037
+ # get features importance from server
4038
+ filtered_importances = self.__get_features_importance_from_server(trace_id, df)
4039
+
4040
+ if len(filtered_importances) == 0:
3996
4041
  return []
3997
4042
 
3998
4043
  filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
@@ -4212,11 +4257,13 @@ if response.status_code == 200:
4212
4257
  self,
4213
4258
  importance_threshold: Optional[float],
4214
4259
  max_features: Optional[int],
4260
+ trace_id: str,
4261
+ df: pd.DataFrame,
4215
4262
  ) -> List[str]:
4216
4263
  importance_threshold = self.__validate_importance_threshold(importance_threshold)
4217
4264
  max_features = self.__validate_max_features(max_features)
4218
4265
 
4219
- return self.__filtered_importance_names(importance_threshold, max_features)
4266
+ return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
4220
4267
 
4221
4268
  def __detect_missing_search_keys(
4222
4269
  self,
@@ -3,7 +3,6 @@ from __future__ import annotations
3
3
  import inspect
4
4
  import logging
5
5
  import re
6
- import warnings
7
6
  from collections import defaultdict
8
7
  from copy import deepcopy
9
8
  from dataclasses import dataclass
@@ -755,9 +754,12 @@ class LightGBMWrapper(EstimatorWrapper):
755
754
  logger=logger,
756
755
  )
757
756
  self.cat_features = None
757
+ self.n_classes = None
758
758
 
759
759
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
760
760
  x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
761
+ if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
762
+ self.n_classes = len(np.unique(y_numpy))
761
763
  if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
762
764
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
763
765
  self.cat_features = _get_cat_features(x)
@@ -783,31 +785,40 @@ class LightGBMWrapper(EstimatorWrapper):
783
785
 
784
786
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
785
787
  try:
786
- # Suppress specific warning from SHAP for LightGBM binary classifier
787
- warnings.filterwarnings(
788
- "ignore",
789
- message=(
790
- "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
791
- ),
788
+ shap_matrix = estimator.predict(
789
+ x,
790
+ predict_disable_shape_check=True,
791
+ raw_score=True,
792
+ pred_leaf=False,
793
+ pred_early_stop=True,
794
+ pred_contrib=True,
792
795
  )
793
- from shap import TreeExplainer
794
-
795
- if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
796
- return None
797
796
 
798
- explainer = TreeExplainer(estimator)
799
-
800
- shap_values = explainer.shap_values(x)
797
+ if self.target_type == ModelTaskType.MULTICLASS:
798
+ n_feat = x.shape[1]
799
+ shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
800
+ shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
801
801
 
802
- # For classification, shap_values is returned as a list for each class
803
- # Take values for the positive class
804
- if isinstance(shap_values, list):
805
- shap_values = shap_values[1]
802
+ # exclude base value
803
+ shap_matrix = shap_matrix[:, :-1]
806
804
 
807
- # Calculate mean absolute SHAP value for each feature
808
805
  feature_importance = {}
809
806
  for i, col in enumerate(x.columns):
810
- feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
807
+ feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
808
+
809
+ # # exclude last column (base value)
810
+ # shap_values_only = shap_values[:, :-1]
811
+ # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
812
+
813
+ # # For classification, shap_values is returned as a list for each class
814
+ # # Take values for the positive class
815
+ # if isinstance(shap_values, list):
816
+ # shap_values = shap_values[1]
817
+
818
+ # # Calculate mean absolute SHAP value for each feature
819
+ # feature_importance = {}
820
+ # for i, col in enumerate(x.columns):
821
+ # feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
811
822
 
812
823
  return feature_importance
813
824
 
@@ -1 +0,0 @@
1
- __version__ = "1.2.71a3832.dev9"